From 1fa93d30bf0ad410eb301a8584f6eef582a1dac7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Bidar?=
Date: Mon, 21 Nov 2022 15:06:09 +0200
Subject: [PATCH 1/2] packaging: Explicitly require gcc and make
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Björn Bidar
---
 rpm/openssl.spec | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/rpm/openssl.spec b/rpm/openssl.spec
index 98e0dda..b185770 100644
--- a/rpm/openssl.spec
+++ b/rpm/openssl.spec
@@ -83,6 +83,8 @@ Patch56: openssl-1.1.1-s390x-ecc.patch
 License: OpenSSL
 URL: http://www.openssl.org/
+BuildRequires: make
+BuildRequires: gcc
 BuildRequires: coreutils, perl, sed, zlib-devel
 BuildRequires: lksctp-tools-devel
 # /usr/bin/cmp

From cddac481dd0e83049149cb45ef8e946de03b8f34 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Bidar?=
Date: Mon, 21 Nov 2022 15:32:50 +0200
Subject: [PATCH 2/2] [openssl] Update to 1.1.1s, pick performance backports.
 Fixes JB#59047
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The following CVEs are addressed with this update:
- CVE-2022-0778
- CVE-2022-1292
- CVE-2022-2068
- CVE-2022-2097

Also rebase the patches, namely openssl-1.1.1-arm-update.patch, which was a
backport from upstream commit d6e4287c9726691e800bff221be71edd894a3c6a after
a8f6d73fda64d514171e99a50d1483c0c0b8d968 was merged.

Then pick the performance patches backported by OpenSUSE.

Signed-off-by: Björn Bidar
---
 openssl | 2 +-
 ...-armx.pl-20-improvement-on-ThunderX2.patch | 464 +
 rpm/openssl-1.1.1-arm-update.patch | 3706 --------
 rpm/openssl-1.1.1-evp-kdf.patch | 9 +-
 rpm/openssl-1.1.1-fips.patch | 10 +-
 rpm/openssl-1.1.1-s390x-ecc.patch | 2 +-
 rpm/openssl-1.1.1-system-cipherlist.patch | 7 +-
 rpm/openssl-1.1.1-version-override.patch | 13 +-
 rpm/openssl-1_1-Optimize-AES-GCM-uarchs.patch | 7709 +++++++++++++++++
 ...openssl-1_1-Optimize-AES-XTS-aarch64.patch | 1616 ++++
 rpm/openssl-1_1-Optimize-RSA-armv8.patch | 575 ++
 rpm/openssl.spec | 7 +-
 12 files changed, 10384 insertions(+), 3736 deletions(-)
 create mode 100644 rpm/openssl-1.1.1-aes-asm-aesv8-armx.pl-20-improvement-on-ThunderX2.patch
 delete mode 100644 rpm/openssl-1.1.1-arm-update.patch
 create mode 100644 rpm/openssl-1_1-Optimize-AES-GCM-uarchs.patch
 create mode 100644 rpm/openssl-1_1-Optimize-AES-XTS-aarch64.patch
 create mode 100644 rpm/openssl-1_1-Optimize-RSA-armv8.patch

diff --git a/openssl b/openssl
index fb047eb..1290581 160000
--- a/openssl
+++ b/openssl
@@ -1 +1 @@
-Subproject commit fb047ebc87b18bdc4cf9ddee9ee1f5ed93e56aff
+Subproject commit 129058165d195e43a0ad10111b0c2e29bdf65980
diff --git a/rpm/openssl-1.1.1-aes-asm-aesv8-armx.pl-20-improvement-on-ThunderX2.patch b/rpm/openssl-1.1.1-aes-asm-aesv8-armx.pl-20-improvement-on-ThunderX2.patch
new file mode 100644
index 0000000..c93e73c
--- /dev/null
+++ b/rpm/openssl-1.1.1-aes-asm-aesv8-armx.pl-20-improvement-on-ThunderX2.patch
@@ -0,0 +1,464 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Andy Polyakov
+Date: Wed, 17 Apr 2019 21:30:39 +0200
+Subject: [PATCH] aes/asm/aesv8-armx.pl: ~20% improvement on ThunderX2.
+MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Reviewed-by: Tim Hudson +Reviewed-by: Richard Levitte +Signed-off-by: Björn Bidar +(Merged from https://github.com/openssl/openssl/pull/8776) +--- + crypto/aes/asm/aesv8-armx.pl | 394 ++++++++++++++++++++++++++++++++++- + 1 file changed, 389 insertions(+), 5 deletions(-) + +diff --git a/crypto/aes/asm/aesv8-armx.pl b/crypto/aes/asm/aesv8-armx.pl +index 7edc436a53ca0500c1cf84b72f65d4a8c7a65942..23cff11ad3701dcfff4b559acf115d0fd2fa78e9 100755 +--- a/crypto/aes/asm/aesv8-armx.pl ++++ b/crypto/aes/asm/aesv8-armx.pl +@@ -27,18 +27,34 @@ + # CBC encrypt case. On Cortex-A57 parallelizable mode performance + # seems to be limited by sheer amount of NEON instructions... + # ++# April 2019 ++# ++# Key to performance of parallelize-able modes is round instruction ++# interleaving. But which factor to use? There is optimal one for ++# each combination of instruction latency and issue rate, beyond ++# which increasing interleave factor doesn't pay off. While on cons ++# side we have code size increase and resource waste on platforms for ++# which interleave factor is too high. In other words you want it to ++# be just right. So far interleave factor of 3x was serving well all ++# platforms. But for ThunderX2 optimal interleave factor was measured ++# to be 5x... ++# + # Performance in cycles per byte processed with 128-bit key: + # + # CBC enc CBC dec CTR + # Apple A7 2.39 1.20 1.20 +-# Cortex-A53 1.32 1.29 1.46 +-# Cortex-A57(*) 1.95 0.85 0.93 +-# Denver 1.96 0.86 0.80 +-# Mongoose 1.33 1.20 1.20 +-# Kryo 1.26 0.94 1.00 ++# Cortex-A53 1.32 1.17/1.29(**) 1.36/1.46 ++# Cortex-A57(*) 1.95 0.82/0.85 0.89/0.93 ++# Cortex-A72 1.33 0.85/0.88 0.92/0.96 ++# Denver 1.96 0.65/0.86 0.76/0.80 ++# Mongoose 1.33 1.23/1.20 1.30/1.20 ++# Kryo 1.26 0.87/0.94 1.00/1.00 ++# ThunderX2 5.95 1.25 1.30 + # + # (*) original 3.64/1.34/1.32 results were for r0p0 revision + # and are still same even for updated module; ++# (**) numbers after slash are for 32-bit code, which is 3x- ++# interleaved; + + $flavour = shift; + $output = shift; +@@ -519,6 +535,13 @@ $code.=<<___; + ___ + { + my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); ++ ++my ($dat3,$in3,$tmp3); # used only in 64-bit mode ++my ($dat4,$in4,$tmp4); ++if ($flavour =~ /64/) { ++ ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); ++} ++ + $code.=<<___; + .align 5 + .Lcbc_dec: +@@ -535,7 +558,196 @@ $code.=<<___; + vorr $in0,$dat,$dat + vorr $in1,$dat1,$dat1 + vorr $in2,$dat2,$dat2 ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ cmp $len,#32 ++ b.lo .Loop3x_cbc_dec ++ ++ vld1.8 {$dat3},[$inp],#16 ++ vld1.8 {$dat4},[$inp],#16 ++ sub $len,$len,#32 // bias ++ mov $cnt,$rounds ++ vorr $in3,$dat3,$dat3 ++ vorr $in4,$dat4,$dat4 ++ ++.Loop5x_cbc_dec: ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q8 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q8 ++ aesimc $dat4,$dat4 ++ vld1.32 {q8},[$key_],#16 ++ subs $cnt,$cnt,#2 ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q9 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q9 ++ aesimc $dat4,$dat4 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Loop5x_cbc_dec ++ ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q8 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q8 ++ aesimc $dat4,$dat4 ++ cmp $len,#0x40 // because .Lcbc_tail4x ++ 
sub $len,$len,#0x50 ++ ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q9 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q9 ++ aesimc $dat4,$dat4 ++ csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo ++ mov $key_,$key ++ ++ aesd $dat0,q10 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q10 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q10 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q10 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q10 ++ aesimc $dat4,$dat4 ++ add $inp,$inp,x6 // $inp is adjusted in such way that ++ // at exit from the loop $dat1-$dat4 ++ // are loaded with last "words" ++ add x6,$len,#0x60 // because .Lcbc_tail4x ++ ++ aesd $dat0,q11 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q11 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q11 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q11 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q11 ++ aesimc $dat4,$dat4 + ++ aesd $dat0,q12 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q12 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q12 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q12 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q12 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q13 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q13 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q13 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q13 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q13 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q14 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q14 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q14 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q14 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q14 ++ aesimc $dat4,$dat4 ++ ++ veor $tmp0,$ivec,$rndlast ++ aesd $dat0,q15 ++ veor $tmp1,$in0,$rndlast ++ vld1.8 {$in0},[$inp],#16 ++ aesd $dat1,q15 ++ veor $tmp2,$in1,$rndlast ++ vld1.8 {$in1},[$inp],#16 ++ aesd $dat2,q15 ++ veor $tmp3,$in2,$rndlast ++ vld1.8 {$in2},[$inp],#16 ++ aesd $dat3,q15 ++ veor $tmp4,$in3,$rndlast ++ vld1.8 {$in3},[$inp],#16 ++ aesd $dat4,q15 ++ vorr $ivec,$in4,$in4 ++ vld1.8 {$in4},[$inp],#16 ++ cbz x6,.Lcbc_tail4x ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ veor $tmp0,$tmp0,$dat0 ++ vorr $dat0,$in0,$in0 ++ veor $tmp1,$tmp1,$dat1 ++ vorr $dat1,$in1,$in1 ++ veor $tmp2,$tmp2,$dat2 ++ vorr $dat2,$in2,$in2 ++ veor $tmp3,$tmp3,$dat3 ++ vorr $dat3,$in3,$in3 ++ veor $tmp4,$tmp4,$dat4 ++ vst1.8 {$tmp0},[$out],#16 ++ vorr $dat4,$in4,$in4 ++ vst1.8 {$tmp1},[$out],#16 ++ mov $cnt,$rounds ++ vst1.8 {$tmp2},[$out],#16 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp3},[$out],#16 ++ vst1.8 {$tmp4},[$out],#16 ++ b.hs .Loop5x_cbc_dec ++ ++ add $len,$len,#0x50 ++ cbz $len,.Lcbc_done ++ ++ add $cnt,$rounds,#2 ++ subs $len,$len,#0x30 ++ vorr $dat0,$in2,$in2 ++ vorr $in0,$in2,$in2 ++ vorr $dat1,$in3,$in3 ++ vorr $in1,$in3,$in3 ++ vorr $dat2,$in4,$in4 ++ vorr $in2,$in4,$in4 ++ b.lo .Lcbc_dec_tail ++ ++ b .Loop3x_cbc_dec ++ ++.align 4 ++.Lcbc_tail4x: ++ veor $tmp1,$tmp0,$dat1 ++ veor $tmp2,$tmp2,$dat2 ++ veor $tmp3,$tmp3,$dat3 ++ veor $tmp4,$tmp4,$dat4 ++ vst1.8 {$tmp1},[$out],#16 ++ vst1.8 {$tmp2},[$out],#16 ++ vst1.8 {$tmp3},[$out],#16 ++ vst1.8 {$tmp4},[$out],#16 ++ ++ b .Lcbc_done ++.align 4 ++___ ++$code.=<<___; + .Loop3x_cbc_dec: + aesd $dat0,q8 + aesimc $dat0,$dat0 +@@ -696,6 +908,9 @@ my $step="x12"; # aliases with $tctr2 + my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); + my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); + ++# used only in 64-bit mode... 
++my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23)); ++ + my ($dat,$tmp)=($dat0,$tmp0); + + ### q8-q15 preloaded key schedule +@@ -754,6 +969,7 @@ $code.=<<___ if ($flavour =~ /64/); + sub $len,$len,#3 // bias + vmov.32 ${dat2}[3],$tctr2 + ___ ++ + $code.=<<___ if ($flavour !~ /64/); + add $tctr1, $ctr, #1 + vorr $ivec,$dat0,$dat0 +@@ -767,6 +983,174 @@ $code.=<<___ if ($flavour !~ /64/); + sub $len,$len,#3 // bias + vorr $dat2,$ivec,$ivec + ___ ++ ++$code.=<<___ if ($flavour =~ /64/); ++ cmp $len,#2 ++ b.lo .Loop3x_ctr32 ++ ++ add w13,$ctr,#1 ++ add w14,$ctr,#2 ++ vorr $dat3,$dat0,$dat0 ++ rev w13,w13 ++ vorr $dat4,$dat0,$dat0 ++ rev w14,w14 ++ vmov.32 ${dat3}[3],w13 ++ sub $len,$len,#2 // bias ++ vmov.32 ${dat4}[3],w14 ++ add $ctr,$ctr,#2 ++ b .Loop5x_ctr32 ++ ++.align 4 ++.Loop5x_ctr32: ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ aese $dat3,q8 ++ aesmc $dat3,$dat3 ++ aese $dat4,q8 ++ aesmc $dat4,$dat4 ++ vld1.32 {q8},[$key_],#16 ++ subs $cnt,$cnt,#2 ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ aese $dat3,q9 ++ aesmc $dat3,$dat3 ++ aese $dat4,q9 ++ aesmc $dat4,$dat4 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Loop5x_ctr32 ++ ++ mov $key_,$key ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ aese $dat3,q8 ++ aesmc $dat3,$dat3 ++ aese $dat4,q8 ++ aesmc $dat4,$dat4 ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ aese $dat3,q9 ++ aesmc $dat3,$dat3 ++ aese $dat4,q9 ++ aesmc $dat4,$dat4 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ ++ aese $dat0,q12 ++ aesmc $dat0,$dat0 ++ add $tctr0,$ctr,#1 ++ add $tctr1,$ctr,#2 ++ aese $dat1,q12 ++ aesmc $dat1,$dat1 ++ add $tctr2,$ctr,#3 ++ add w13,$ctr,#4 ++ aese $dat2,q12 ++ aesmc $dat2,$dat2 ++ add w14,$ctr,#5 ++ rev $tctr0,$tctr0 ++ aese $dat3,q12 ++ aesmc $dat3,$dat3 ++ rev $tctr1,$tctr1 ++ rev $tctr2,$tctr2 ++ aese $dat4,q12 ++ aesmc $dat4,$dat4 ++ rev w13,w13 ++ rev w14,w14 ++ ++ aese $dat0,q13 ++ aesmc $dat0,$dat0 ++ aese $dat1,q13 ++ aesmc $dat1,$dat1 ++ aese $dat2,q13 ++ aesmc $dat2,$dat2 ++ aese $dat3,q13 ++ aesmc $dat3,$dat3 ++ aese $dat4,q13 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q14 ++ aesmc $dat0,$dat0 ++ vld1.8 {$in0},[$inp],#16 ++ aese $dat1,q14 ++ aesmc $dat1,$dat1 ++ vld1.8 {$in1},[$inp],#16 ++ aese $dat2,q14 ++ aesmc $dat2,$dat2 ++ vld1.8 {$in2},[$inp],#16 ++ aese $dat3,q14 ++ aesmc $dat3,$dat3 ++ vld1.8 {$in3},[$inp],#16 ++ aese $dat4,q14 ++ aesmc $dat4,$dat4 ++ vld1.8 {$in4},[$inp],#16 ++ ++ aese $dat0,q15 ++ veor $in0,$in0,$rndlast ++ aese $dat1,q15 ++ veor $in1,$in1,$rndlast ++ aese $dat2,q15 ++ veor $in2,$in2,$rndlast ++ aese $dat3,q15 ++ veor $in3,$in3,$rndlast ++ aese $dat4,q15 ++ veor $in4,$in4,$rndlast ++ ++ veor $in0,$in0,$dat0 ++ vorr $dat0,$ivec,$ivec ++ veor $in1,$in1,$dat1 ++ vorr $dat1,$ivec,$ivec ++ veor $in2,$in2,$dat2 ++ vorr $dat2,$ivec,$ivec ++ veor $in3,$in3,$dat3 ++ vorr $dat3,$ivec,$ivec ++ veor $in4,$in4,$dat4 ++ vorr $dat4,$ivec,$ivec ++ ++ vst1.8 {$in0},[$out],#16 ++ vmov.32 ${dat0}[3],$tctr0 ++ vst1.8 {$in1},[$out],#16 ++ vmov.32 ${dat1}[3],$tctr1 ++ vst1.8 {$in2},[$out],#16 ++ vmov.32 ${dat2}[3],$tctr2 ++ vst1.8 {$in3},[$out],#16 ++ vmov.32 ${dat3}[3],w13 ++ vst1.8 {$in4},[$out],#16 ++ vmov.32 ${dat4}[3],w14 ++ ++ mov $cnt,$rounds ++ cbz $len,.Lctr32_done ++ ++ add $ctr,$ctr,#5 ++ 
subs $len,$len,#5 ++ b.hs .Loop5x_ctr32 ++ ++ add $len,$len,#5 ++ sub $ctr,$ctr,#5 ++ ++ cmp $len,#2 ++ mov $step,#16 ++ cclr $step,lo ++ b.ls .Lctr32_tail ++ ++ sub $len,$len,#3 // bias ++ add $ctr,$ctr,#3 ++___ + $code.=<<___; + b .Loop3x_ctr32 + diff --git a/rpm/openssl-1.1.1-arm-update.patch b/rpm/openssl-1.1.1-arm-update.patch deleted file mode 100644 index 2b8c549..0000000 --- a/rpm/openssl-1.1.1-arm-update.patch +++ /dev/null @@ -1,3706 +0,0 @@ -diff -up openssl-1.1.1i/crypto/aes/asm/aesv8-armx.pl.arm-update openssl-1.1.1i/crypto/aes/asm/aesv8-armx.pl ---- openssl-1.1.1i/crypto/aes/asm/aesv8-armx.pl.arm-update 2020-12-08 14:20:59.000000000 +0100 -+++ openssl-1.1.1i/crypto/aes/asm/aesv8-armx.pl 2020-12-09 10:39:50.645705385 +0100 -@@ -27,44 +27,72 @@ - # CBC encrypt case. On Cortex-A57 parallelizable mode performance - # seems to be limited by sheer amount of NEON instructions... - # -+# April 2019 -+# -+# Key to performance of parallelize-able modes is round instruction -+# interleaving. But which factor to use? There is optimal one for -+# each combination of instruction latency and issue rate, beyond -+# which increasing interleave factor doesn't pay off. While on cons -+# side we have code size increase and resource waste on platforms for -+# which interleave factor is too high. In other words you want it to -+# be just right. So far interleave factor of 3x was serving well all -+# platforms. But for ThunderX2 optimal interleave factor was measured -+# to be 5x... -+# - # Performance in cycles per byte processed with 128-bit key: - # - # CBC enc CBC dec CTR - # Apple A7 2.39 1.20 1.20 --# Cortex-A53 1.32 1.29 1.46 --# Cortex-A57(*) 1.95 0.85 0.93 --# Denver 1.96 0.86 0.80 --# Mongoose 1.33 1.20 1.20 --# Kryo 1.26 0.94 1.00 -+# Cortex-A53 1.32 1.17/1.29(**) 1.36/1.46 -+# Cortex-A57(*) 1.95 0.82/0.85 0.89/0.93 -+# Cortex-A72 1.33 0.85/0.88 0.92/0.96 -+# Denver 1.96 0.65/0.86 0.76/0.80 -+# Mongoose 1.33 1.23/1.20 1.30/1.20 -+# Kryo 1.26 0.87/0.94 1.00/1.00 -+# ThunderX2 5.95 1.25 1.30 - # - # (*) original 3.64/1.34/1.32 results were for r0p0 revision - # and are still same even for updated module; -+# (**) numbers after slash are for 32-bit code, which is 3x- -+# interleaved; - --$flavour = shift; --$output = shift; -+# $output is the last argument if it looks like a file (it has an extension) -+# $flavour is the first argument if it doesn't look like a file -+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; -+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; - - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - --open OUT,"| \"$^X\" $xlate $flavour $output"; -+open OUT,"| \"$^X\" $xlate $flavour \"$output\"" -+ or die "can't call $xlate: $!"; - *STDOUT=*OUT; - - $prefix="aes_v8"; - -+$_byte = ($flavour =~ /win/ ? 
"DCB" : ".byte"); -+ - $code=<<___; - #include "arm_arch.h" - - #if __ARM_MAX_ARCH__>=7 --.text - ___ --$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/); -+$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/); - $code.=<<___ if ($flavour !~ /64/); - .arch armv7-a // don't confuse not-so-latest binutils with argv8 :-) - .fpu neon -+#ifdef __thumb2__ -+.syntax unified -+.thumb -+# define INST(a,b,c,d) $_byte c,d|0xc,a,b -+#else - .code 32 --#undef __thumb2__ -+# define INST(a,b,c,d) $_byte a,b,c,d -+#endif -+ -+.text - ___ - - # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax, -@@ -361,6 +389,836 @@ ___ - &gen_block("en"); - &gen_block("de"); - }}} -+ -+# Performance in cycles per byte. -+# Processed with AES-ECB different key size. -+# It shows the value before and after optimization as below: -+# (before/after): -+# -+# AES-128-ECB AES-192-ECB AES-256-ECB -+# Cortex-A57 1.85/0.82 2.16/0.96 2.47/1.10 -+# Cortex-A72 1.64/0.85 1.82/0.99 2.13/1.14 -+ -+# Optimization is implemented by loop unrolling and interleaving. -+# Commonly, we choose the unrolling factor as 5, if the input -+# data size smaller than 5 blocks, but not smaller than 3 blocks, -+# choose 3 as the unrolling factor. -+# If the input data size dsize >= 5*16 bytes, then take 5 blocks -+# as one iteration, every loop the left size lsize -= 5*16. -+# If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration, -+# every loop lsize -=3*16. -+# If lsize < 3*16 bytes, treat them as the tail, interleave the -+# two blocks AES instructions. -+# There is one special case, if the original input data size dsize -+# = 16 bytes, we will treat it seperately to improve the -+# performance: one independent code block without LR, FP load and -+# store, just looks like what the original ECB implementation does. -+ -+{{{ -+my ($inp,$out,$len,$key)=map("x$_",(0..3)); -+my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8"); -+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); -+ -+my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); -+ -+### q7 last round key -+### q10-q15 q7 Last 7 round keys -+### q8-q9 preloaded round keys except last 7 keys for big size -+### q5, q6, q8-q9 preloaded round keys except last 7 keys for only 16 byte -+ -+{ -+my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); -+ -+my ($dat3,$in3,$tmp3); # used only in 64-bit mode -+my ($dat4,$in4,$tmp4); -+if ($flavour =~ /64/) { -+ ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); -+} -+ -+$code.=<<___; -+.globl ${prefix}_ecb_encrypt -+.type ${prefix}_ecb_encrypt,%function -+.align 5 -+${prefix}_ecb_encrypt: -+___ -+$code.=<<___ if ($flavour =~ /64/); -+ subs $len,$len,#16 -+ // Original input data size bigger than 16, jump to big size processing. -+ b.ne .Lecb_big_size -+ vld1.8 {$dat0},[$inp] -+ cmp $enc,#0 // en- or decrypting? -+ ldr $rounds,[$key,#240] -+ vld1.32 {q5-q6},[$key],#32 // load key schedule... -+ -+ b.eq .Lecb_small_dec -+ aese $dat0,q5 -+ aesmc $dat0,$dat0 -+ vld1.32 {q8-q9},[$key],#32 // load key schedule... -+ aese $dat0,q6 -+ aesmc $dat0,$dat0 -+ subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-ecb processing -+ b.eq .Lecb_128_enc -+.Lecb_round_loop: -+ aese $dat0,q8 -+ aesmc $dat0,$dat0 -+ vld1.32 {q8},[$key],#16 // load key schedule... -+ aese $dat0,q9 -+ aesmc $dat0,$dat0 -+ vld1.32 {q9},[$key],#16 // load key schedule... -+ subs $rounds,$rounds,#2 // bias -+ b.gt .Lecb_round_loop -+.Lecb_128_enc: -+ vld1.32 {q10-q11},[$key],#32 // load key schedule... 
-+ aese $dat0,q8 -+ aesmc $dat0,$dat0 -+ aese $dat0,q9 -+ aesmc $dat0,$dat0 -+ vld1.32 {q12-q13},[$key],#32 // load key schedule... -+ aese $dat0,q10 -+ aesmc $dat0,$dat0 -+ aese $dat0,q11 -+ aesmc $dat0,$dat0 -+ vld1.32 {q14-q15},[$key],#32 // load key schedule... -+ aese $dat0,q12 -+ aesmc $dat0,$dat0 -+ aese $dat0,q13 -+ aesmc $dat0,$dat0 -+ vld1.32 {$rndlast},[$key] -+ aese $dat0,q14 -+ aesmc $dat0,$dat0 -+ aese $dat0,q15 -+ veor $dat0,$dat0,$rndlast -+ vst1.8 {$dat0},[$out] -+ b .Lecb_Final_abort -+.Lecb_small_dec: -+ aesd $dat0,q5 -+ aesimc $dat0,$dat0 -+ vld1.32 {q8-q9},[$key],#32 // load key schedule... -+ aesd $dat0,q6 -+ aesimc $dat0,$dat0 -+ subs $rounds,$rounds,#10 // bias -+ b.eq .Lecb_128_dec -+.Lecb_dec_round_loop: -+ aesd $dat0,q8 -+ aesimc $dat0,$dat0 -+ vld1.32 {q8},[$key],#16 // load key schedule... -+ aesd $dat0,q9 -+ aesimc $dat0,$dat0 -+ vld1.32 {q9},[$key],#16 // load key schedule... -+ subs $rounds,$rounds,#2 // bias -+ b.gt .Lecb_dec_round_loop -+.Lecb_128_dec: -+ vld1.32 {q10-q11},[$key],#32 // load key schedule... -+ aesd $dat0,q8 -+ aesimc $dat0,$dat0 -+ aesd $dat0,q9 -+ aesimc $dat0,$dat0 -+ vld1.32 {q12-q13},[$key],#32 // load key schedule... -+ aesd $dat0,q10 -+ aesimc $dat0,$dat0 -+ aesd $dat0,q11 -+ aesimc $dat0,$dat0 -+ vld1.32 {q14-q15},[$key],#32 // load key schedule... -+ aesd $dat0,q12 -+ aesimc $dat0,$dat0 -+ aesd $dat0,q13 -+ aesimc $dat0,$dat0 -+ vld1.32 {$rndlast},[$key] -+ aesd $dat0,q14 -+ aesimc $dat0,$dat0 -+ aesd $dat0,q15 -+ veor $dat0,$dat0,$rndlast -+ vst1.8 {$dat0},[$out] -+ b .Lecb_Final_abort -+.Lecb_big_size: -+___ -+$code.=<<___ if ($flavour =~ /64/); -+ stp x29,x30,[sp,#-16]! -+ add x29,sp,#0 -+___ -+$code.=<<___ if ($flavour !~ /64/); -+ mov ip,sp -+ stmdb sp!,{r4-r8,lr} -+ vstmdb sp!,{d8-d15} @ ABI specification says so -+ ldmia ip,{r4-r5} @ load remaining args -+ subs $len,$len,#16 -+___ -+$code.=<<___; -+ mov $step,#16 -+ b.lo .Lecb_done -+ cclr $step,eq -+ -+ cmp $enc,#0 // en- or decrypting? -+ ldr $rounds,[$key,#240] -+ and $len,$len,#-16 -+ vld1.8 {$dat},[$inp],$step -+ -+ vld1.32 {q8-q9},[$key] // load key schedule... 
-+ sub $rounds,$rounds,#6 -+ add $key_,$key,x5,lsl#4 // pointer to last 7 round keys -+ sub $rounds,$rounds,#2 -+ vld1.32 {q10-q11},[$key_],#32 -+ vld1.32 {q12-q13},[$key_],#32 -+ vld1.32 {q14-q15},[$key_],#32 -+ vld1.32 {$rndlast},[$key_] -+ -+ add $key_,$key,#32 -+ mov $cnt,$rounds -+ b.eq .Lecb_dec -+ -+ vld1.8 {$dat1},[$inp],#16 -+ subs $len,$len,#32 // bias -+ add $cnt,$rounds,#2 -+ vorr $in1,$dat1,$dat1 -+ vorr $dat2,$dat1,$dat1 -+ vorr $dat1,$dat,$dat -+ b.lo .Lecb_enc_tail -+ -+ vorr $dat1,$in1,$in1 -+ vld1.8 {$dat2},[$inp],#16 -+___ -+$code.=<<___ if ($flavour =~ /64/); -+ cmp $len,#32 -+ b.lo .Loop3x_ecb_enc -+ -+ vld1.8 {$dat3},[$inp],#16 -+ vld1.8 {$dat4},[$inp],#16 -+ sub $len,$len,#32 // bias -+ mov $cnt,$rounds -+ -+.Loop5x_ecb_enc: -+ aese $dat0,q8 -+ aesmc $dat0,$dat0 -+ aese $dat1,q8 -+ aesmc $dat1,$dat1 -+ aese $dat2,q8 -+ aesmc $dat2,$dat2 -+ aese $dat3,q8 -+ aesmc $dat3,$dat3 -+ aese $dat4,q8 -+ aesmc $dat4,$dat4 -+ vld1.32 {q8},[$key_],#16 -+ subs $cnt,$cnt,#2 -+ aese $dat0,q9 -+ aesmc $dat0,$dat0 -+ aese $dat1,q9 -+ aesmc $dat1,$dat1 -+ aese $dat2,q9 -+ aesmc $dat2,$dat2 -+ aese $dat3,q9 -+ aesmc $dat3,$dat3 -+ aese $dat4,q9 -+ aesmc $dat4,$dat4 -+ vld1.32 {q9},[$key_],#16 -+ b.gt .Loop5x_ecb_enc -+ -+ aese $dat0,q8 -+ aesmc $dat0,$dat0 -+ aese $dat1,q8 -+ aesmc $dat1,$dat1 -+ aese $dat2,q8 -+ aesmc $dat2,$dat2 -+ aese $dat3,q8 -+ aesmc $dat3,$dat3 -+ aese $dat4,q8 -+ aesmc $dat4,$dat4 -+ cmp $len,#0x40 // because .Lecb_enc_tail4x -+ sub $len,$len,#0x50 -+ -+ aese $dat0,q9 -+ aesmc $dat0,$dat0 -+ aese $dat1,q9 -+ aesmc $dat1,$dat1 -+ aese $dat2,q9 -+ aesmc $dat2,$dat2 -+ aese $dat3,q9 -+ aesmc $dat3,$dat3 -+ aese $dat4,q9 -+ aesmc $dat4,$dat4 -+ csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo -+ mov $key_,$key -+ -+ aese $dat0,q10 -+ aesmc $dat0,$dat0 -+ aese $dat1,q10 -+ aesmc $dat1,$dat1 -+ aese $dat2,q10 -+ aesmc $dat2,$dat2 -+ aese $dat3,q10 -+ aesmc $dat3,$dat3 -+ aese $dat4,q10 -+ aesmc $dat4,$dat4 -+ add $inp,$inp,x6 // $inp is adjusted in such way that -+ // at exit from the loop $dat1-$dat4 -+ // are loaded with last "words" -+ add x6,$len,#0x60 // because .Lecb_enc_tail4x -+ -+ aese $dat0,q11 -+ aesmc $dat0,$dat0 -+ aese $dat1,q11 -+ aesmc $dat1,$dat1 -+ aese $dat2,q11 -+ aesmc $dat2,$dat2 -+ aese $dat3,q11 -+ aesmc $dat3,$dat3 -+ aese $dat4,q11 -+ aesmc $dat4,$dat4 -+ -+ aese $dat0,q12 -+ aesmc $dat0,$dat0 -+ aese $dat1,q12 -+ aesmc $dat1,$dat1 -+ aese $dat2,q12 -+ aesmc $dat2,$dat2 -+ aese $dat3,q12 -+ aesmc $dat3,$dat3 -+ aese $dat4,q12 -+ aesmc $dat4,$dat4 -+ -+ aese $dat0,q13 -+ aesmc $dat0,$dat0 -+ aese $dat1,q13 -+ aesmc $dat1,$dat1 -+ aese $dat2,q13 -+ aesmc $dat2,$dat2 -+ aese $dat3,q13 -+ aesmc $dat3,$dat3 -+ aese $dat4,q13 -+ aesmc $dat4,$dat4 -+ -+ aese $dat0,q14 -+ aesmc $dat0,$dat0 -+ aese $dat1,q14 -+ aesmc $dat1,$dat1 -+ aese $dat2,q14 -+ aesmc $dat2,$dat2 -+ aese $dat3,q14 -+ aesmc $dat3,$dat3 -+ aese $dat4,q14 -+ aesmc $dat4,$dat4 -+ -+ aese $dat0,q15 -+ vld1.8 {$in0},[$inp],#16 -+ aese $dat1,q15 -+ vld1.8 {$in1},[$inp],#16 -+ aese $dat2,q15 -+ vld1.8 {$in2},[$inp],#16 -+ aese $dat3,q15 -+ vld1.8 {$in3},[$inp],#16 -+ aese $dat4,q15 -+ vld1.8 {$in4},[$inp],#16 -+ cbz x6,.Lecb_enc_tail4x -+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] -+ veor $tmp0,$rndlast,$dat0 -+ vorr $dat0,$in0,$in0 -+ veor $tmp1,$rndlast,$dat1 -+ vorr $dat1,$in1,$in1 -+ veor $tmp2,$rndlast,$dat2 -+ vorr $dat2,$in2,$in2 -+ veor $tmp3,$rndlast,$dat3 -+ vorr $dat3,$in3,$in3 -+ veor $tmp4,$rndlast,$dat4 -+ vst1.8 {$tmp0},[$out],#16 -+ vorr $dat4,$in4,$in4 -+ 
vst1.8 {$tmp1},[$out],#16 -+ mov $cnt,$rounds -+ vst1.8 {$tmp2},[$out],#16 -+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] -+ vst1.8 {$tmp3},[$out],#16 -+ vst1.8 {$tmp4},[$out],#16 -+ b.hs .Loop5x_ecb_enc -+ -+ add $len,$len,#0x50 -+ cbz $len,.Lecb_done -+ -+ add $cnt,$rounds,#2 -+ subs $len,$len,#0x30 -+ vorr $dat0,$in2,$in2 -+ vorr $dat1,$in3,$in3 -+ vorr $dat2,$in4,$in4 -+ b.lo .Lecb_enc_tail -+ -+ b .Loop3x_ecb_enc -+ -+.align 4 -+.Lecb_enc_tail4x: -+ veor $tmp1,$rndlast,$dat1 -+ veor $tmp2,$rndlast,$dat2 -+ veor $tmp3,$rndlast,$dat3 -+ veor $tmp4,$rndlast,$dat4 -+ vst1.8 {$tmp1},[$out],#16 -+ vst1.8 {$tmp2},[$out],#16 -+ vst1.8 {$tmp3},[$out],#16 -+ vst1.8 {$tmp4},[$out],#16 -+ -+ b .Lecb_done -+.align 4 -+___ -+$code.=<<___; -+.Loop3x_ecb_enc: -+ aese $dat0,q8 -+ aesmc $dat0,$dat0 -+ aese $dat1,q8 -+ aesmc $dat1,$dat1 -+ aese $dat2,q8 -+ aesmc $dat2,$dat2 -+ vld1.32 {q8},[$key_],#16 -+ subs $cnt,$cnt,#2 -+ aese $dat0,q9 -+ aesmc $dat0,$dat0 -+ aese $dat1,q9 -+ aesmc $dat1,$dat1 -+ aese $dat2,q9 -+ aesmc $dat2,$dat2 -+ vld1.32 {q9},[$key_],#16 -+ b.gt .Loop3x_ecb_enc -+ -+ aese $dat0,q8 -+ aesmc $dat0,$dat0 -+ aese $dat1,q8 -+ aesmc $dat1,$dat1 -+ aese $dat2,q8 -+ aesmc $dat2,$dat2 -+ subs $len,$len,#0x30 -+ mov.lo x6,$len // x6, $cnt, is zero at this point -+ aese $dat0,q9 -+ aesmc $dat0,$dat0 -+ aese $dat1,q9 -+ aesmc $dat1,$dat1 -+ aese $dat2,q9 -+ aesmc $dat2,$dat2 -+ add $inp,$inp,x6 // $inp is adjusted in such way that -+ // at exit from the loop $dat1-$dat2 -+ // are loaded with last "words" -+ mov $key_,$key -+ aese $dat0,q12 -+ aesmc $dat0,$dat0 -+ aese $dat1,q12 -+ aesmc $dat1,$dat1 -+ aese $dat2,q12 -+ aesmc $dat2,$dat2 -+ vld1.8 {$in0},[$inp],#16 -+ aese $dat0,q13 -+ aesmc $dat0,$dat0 -+ aese $dat1,q13 -+ aesmc $dat1,$dat1 -+ aese $dat2,q13 -+ aesmc $dat2,$dat2 -+ vld1.8 {$in1},[$inp],#16 -+ aese $dat0,q14 -+ aesmc $dat0,$dat0 -+ aese $dat1,q14 -+ aesmc $dat1,$dat1 -+ aese $dat2,q14 -+ aesmc $dat2,$dat2 -+ vld1.8 {$in2},[$inp],#16 -+ aese $dat0,q15 -+ aese $dat1,q15 -+ aese $dat2,q15 -+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] -+ add $cnt,$rounds,#2 -+ veor $tmp0,$rndlast,$dat0 -+ veor $tmp1,$rndlast,$dat1 -+ veor $dat2,$dat2,$rndlast -+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] -+ vst1.8 {$tmp0},[$out],#16 -+ vorr $dat0,$in0,$in0 -+ vst1.8 {$tmp1},[$out],#16 -+ vorr $dat1,$in1,$in1 -+ vst1.8 {$dat2},[$out],#16 -+ vorr $dat2,$in2,$in2 -+ b.hs .Loop3x_ecb_enc -+ -+ cmn $len,#0x30 -+ b.eq .Lecb_done -+ nop -+ -+.Lecb_enc_tail: -+ aese $dat1,q8 -+ aesmc $dat1,$dat1 -+ aese $dat2,q8 -+ aesmc $dat2,$dat2 -+ vld1.32 {q8},[$key_],#16 -+ subs $cnt,$cnt,#2 -+ aese $dat1,q9 -+ aesmc $dat1,$dat1 -+ aese $dat2,q9 -+ aesmc $dat2,$dat2 -+ vld1.32 {q9},[$key_],#16 -+ b.gt .Lecb_enc_tail -+ -+ aese $dat1,q8 -+ aesmc $dat1,$dat1 -+ aese $dat2,q8 -+ aesmc $dat2,$dat2 -+ aese $dat1,q9 -+ aesmc $dat1,$dat1 -+ aese $dat2,q9 -+ aesmc $dat2,$dat2 -+ aese $dat1,q12 -+ aesmc $dat1,$dat1 -+ aese $dat2,q12 -+ aesmc $dat2,$dat2 -+ cmn $len,#0x20 -+ aese $dat1,q13 -+ aesmc $dat1,$dat1 -+ aese $dat2,q13 -+ aesmc $dat2,$dat2 -+ aese $dat1,q14 -+ aesmc $dat1,$dat1 -+ aese $dat2,q14 -+ aesmc $dat2,$dat2 -+ aese $dat1,q15 -+ aese $dat2,q15 -+ b.eq .Lecb_enc_one -+ veor $tmp1,$rndlast,$dat1 -+ veor $tmp2,$rndlast,$dat2 -+ vst1.8 {$tmp1},[$out],#16 -+ vst1.8 {$tmp2},[$out],#16 -+ b .Lecb_done -+ -+.Lecb_enc_one: -+ veor $tmp1,$rndlast,$dat2 -+ vst1.8 {$tmp1},[$out],#16 -+ b .Lecb_done -+___ -+ -+$code.=<<___; -+.align 5 -+.Lecb_dec: -+ vld1.8 {$dat1},[$inp],#16 -+ subs $len,$len,#32 // 
bias -+ add $cnt,$rounds,#2 -+ vorr $in1,$dat1,$dat1 -+ vorr $dat2,$dat1,$dat1 -+ vorr $dat1,$dat,$dat -+ b.lo .Lecb_dec_tail -+ -+ vorr $dat1,$in1,$in1 -+ vld1.8 {$dat2},[$inp],#16 -+___ -+$code.=<<___ if ($flavour =~ /64/); -+ cmp $len,#32 -+ b.lo .Loop3x_ecb_dec -+ -+ vld1.8 {$dat3},[$inp],#16 -+ vld1.8 {$dat4},[$inp],#16 -+ sub $len,$len,#32 // bias -+ mov $cnt,$rounds -+ -+.Loop5x_ecb_dec: -+ aesd $dat0,q8 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q8 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q8 -+ aesimc $dat2,$dat2 -+ aesd $dat3,q8 -+ aesimc $dat3,$dat3 -+ aesd $dat4,q8 -+ aesimc $dat4,$dat4 -+ vld1.32 {q8},[$key_],#16 -+ subs $cnt,$cnt,#2 -+ aesd $dat0,q9 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q9 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q9 -+ aesimc $dat2,$dat2 -+ aesd $dat3,q9 -+ aesimc $dat3,$dat3 -+ aesd $dat4,q9 -+ aesimc $dat4,$dat4 -+ vld1.32 {q9},[$key_],#16 -+ b.gt .Loop5x_ecb_dec -+ -+ aesd $dat0,q8 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q8 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q8 -+ aesimc $dat2,$dat2 -+ aesd $dat3,q8 -+ aesimc $dat3,$dat3 -+ aesd $dat4,q8 -+ aesimc $dat4,$dat4 -+ cmp $len,#0x40 // because .Lecb_tail4x -+ sub $len,$len,#0x50 -+ -+ aesd $dat0,q9 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q9 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q9 -+ aesimc $dat2,$dat2 -+ aesd $dat3,q9 -+ aesimc $dat3,$dat3 -+ aesd $dat4,q9 -+ aesimc $dat4,$dat4 -+ csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo -+ mov $key_,$key -+ -+ aesd $dat0,q10 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q10 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q10 -+ aesimc $dat2,$dat2 -+ aesd $dat3,q10 -+ aesimc $dat3,$dat3 -+ aesd $dat4,q10 -+ aesimc $dat4,$dat4 -+ add $inp,$inp,x6 // $inp is adjusted in such way that -+ // at exit from the loop $dat1-$dat4 -+ // are loaded with last "words" -+ add x6,$len,#0x60 // because .Lecb_tail4x -+ -+ aesd $dat0,q11 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q11 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q11 -+ aesimc $dat2,$dat2 -+ aesd $dat3,q11 -+ aesimc $dat3,$dat3 -+ aesd $dat4,q11 -+ aesimc $dat4,$dat4 -+ -+ aesd $dat0,q12 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q12 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q12 -+ aesimc $dat2,$dat2 -+ aesd $dat3,q12 -+ aesimc $dat3,$dat3 -+ aesd $dat4,q12 -+ aesimc $dat4,$dat4 -+ -+ aesd $dat0,q13 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q13 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q13 -+ aesimc $dat2,$dat2 -+ aesd $dat3,q13 -+ aesimc $dat3,$dat3 -+ aesd $dat4,q13 -+ aesimc $dat4,$dat4 -+ -+ aesd $dat0,q14 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q14 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q14 -+ aesimc $dat2,$dat2 -+ aesd $dat3,q14 -+ aesimc $dat3,$dat3 -+ aesd $dat4,q14 -+ aesimc $dat4,$dat4 -+ -+ aesd $dat0,q15 -+ vld1.8 {$in0},[$inp],#16 -+ aesd $dat1,q15 -+ vld1.8 {$in1},[$inp],#16 -+ aesd $dat2,q15 -+ vld1.8 {$in2},[$inp],#16 -+ aesd $dat3,q15 -+ vld1.8 {$in3},[$inp],#16 -+ aesd $dat4,q15 -+ vld1.8 {$in4},[$inp],#16 -+ cbz x6,.Lecb_tail4x -+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] -+ veor $tmp0,$rndlast,$dat0 -+ vorr $dat0,$in0,$in0 -+ veor $tmp1,$rndlast,$dat1 -+ vorr $dat1,$in1,$in1 -+ veor $tmp2,$rndlast,$dat2 -+ vorr $dat2,$in2,$in2 -+ veor $tmp3,$rndlast,$dat3 -+ vorr $dat3,$in3,$in3 -+ veor $tmp4,$rndlast,$dat4 -+ vst1.8 {$tmp0},[$out],#16 -+ vorr $dat4,$in4,$in4 -+ vst1.8 {$tmp1},[$out],#16 -+ mov $cnt,$rounds -+ vst1.8 {$tmp2},[$out],#16 -+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] -+ vst1.8 {$tmp3},[$out],#16 -+ vst1.8 {$tmp4},[$out],#16 -+ b.hs .Loop5x_ecb_dec -+ -+ add $len,$len,#0x50 -+ cbz $len,.Lecb_done -+ -+ add $cnt,$rounds,#2 -+ subs $len,$len,#0x30 -+ vorr 
$dat0,$in2,$in2 -+ vorr $dat1,$in3,$in3 -+ vorr $dat2,$in4,$in4 -+ b.lo .Lecb_dec_tail -+ -+ b .Loop3x_ecb_dec -+ -+.align 4 -+.Lecb_tail4x: -+ veor $tmp1,$rndlast,$dat1 -+ veor $tmp2,$rndlast,$dat2 -+ veor $tmp3,$rndlast,$dat3 -+ veor $tmp4,$rndlast,$dat4 -+ vst1.8 {$tmp1},[$out],#16 -+ vst1.8 {$tmp2},[$out],#16 -+ vst1.8 {$tmp3},[$out],#16 -+ vst1.8 {$tmp4},[$out],#16 -+ -+ b .Lecb_done -+.align 4 -+___ -+$code.=<<___; -+.Loop3x_ecb_dec: -+ aesd $dat0,q8 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q8 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q8 -+ aesimc $dat2,$dat2 -+ vld1.32 {q8},[$key_],#16 -+ subs $cnt,$cnt,#2 -+ aesd $dat0,q9 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q9 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q9 -+ aesimc $dat2,$dat2 -+ vld1.32 {q9},[$key_],#16 -+ b.gt .Loop3x_ecb_dec -+ -+ aesd $dat0,q8 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q8 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q8 -+ aesimc $dat2,$dat2 -+ subs $len,$len,#0x30 -+ mov.lo x6,$len // x6, $cnt, is zero at this point -+ aesd $dat0,q9 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q9 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q9 -+ aesimc $dat2,$dat2 -+ add $inp,$inp,x6 // $inp is adjusted in such way that -+ // at exit from the loop $dat1-$dat2 -+ // are loaded with last "words" -+ mov $key_,$key -+ aesd $dat0,q12 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q12 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q12 -+ aesimc $dat2,$dat2 -+ vld1.8 {$in0},[$inp],#16 -+ aesd $dat0,q13 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q13 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q13 -+ aesimc $dat2,$dat2 -+ vld1.8 {$in1},[$inp],#16 -+ aesd $dat0,q14 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q14 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q14 -+ aesimc $dat2,$dat2 -+ vld1.8 {$in2},[$inp],#16 -+ aesd $dat0,q15 -+ aesd $dat1,q15 -+ aesd $dat2,q15 -+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] -+ add $cnt,$rounds,#2 -+ veor $tmp0,$rndlast,$dat0 -+ veor $tmp1,$rndlast,$dat1 -+ veor $dat2,$dat2,$rndlast -+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] -+ vst1.8 {$tmp0},[$out],#16 -+ vorr $dat0,$in0,$in0 -+ vst1.8 {$tmp1},[$out],#16 -+ vorr $dat1,$in1,$in1 -+ vst1.8 {$dat2},[$out],#16 -+ vorr $dat2,$in2,$in2 -+ b.hs .Loop3x_ecb_dec -+ -+ cmn $len,#0x30 -+ b.eq .Lecb_done -+ nop -+ -+.Lecb_dec_tail: -+ aesd $dat1,q8 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q8 -+ aesimc $dat2,$dat2 -+ vld1.32 {q8},[$key_],#16 -+ subs $cnt,$cnt,#2 -+ aesd $dat1,q9 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q9 -+ aesimc $dat2,$dat2 -+ vld1.32 {q9},[$key_],#16 -+ b.gt .Lecb_dec_tail -+ -+ aesd $dat1,q8 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q8 -+ aesimc $dat2,$dat2 -+ aesd $dat1,q9 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q9 -+ aesimc $dat2,$dat2 -+ aesd $dat1,q12 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q12 -+ aesimc $dat2,$dat2 -+ cmn $len,#0x20 -+ aesd $dat1,q13 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q13 -+ aesimc $dat2,$dat2 -+ aesd $dat1,q14 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q14 -+ aesimc $dat2,$dat2 -+ aesd $dat1,q15 -+ aesd $dat2,q15 -+ b.eq .Lecb_dec_one -+ veor $tmp1,$rndlast,$dat1 -+ veor $tmp2,$rndlast,$dat2 -+ vst1.8 {$tmp1},[$out],#16 -+ vst1.8 {$tmp2},[$out],#16 -+ b .Lecb_done -+ -+.Lecb_dec_one: -+ veor $tmp1,$rndlast,$dat2 -+ vst1.8 {$tmp1},[$out],#16 -+ -+.Lecb_done: -+___ -+} -+$code.=<<___ if ($flavour !~ /64/); -+ vldmia sp!,{d8-d15} -+ ldmia sp!,{r4-r8,pc} -+___ -+$code.=<<___ if ($flavour =~ /64/); -+ ldr x29,[sp],#16 -+___ -+$code.=<<___ if ($flavour =~ /64/); -+.Lecb_Final_abort: -+ ret -+___ -+$code.=<<___; -+.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt -+___ -+}}} - {{{ - my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my 
$enc="w5"; - my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12"); -@@ -519,6 +1377,13 @@ $code.=<<___; - ___ - { - my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); -+ -+my ($dat3,$in3,$tmp3); # used only in 64-bit mode -+my ($dat4,$in4,$tmp4); -+if ($flavour =~ /64/) { -+ ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); -+} -+ - $code.=<<___; - .align 5 - .Lcbc_dec: -@@ -535,7 +1400,196 @@ $code.=<<___; - vorr $in0,$dat,$dat - vorr $in1,$dat1,$dat1 - vorr $in2,$dat2,$dat2 -+___ -+$code.=<<___ if ($flavour =~ /64/); -+ cmp $len,#32 -+ b.lo .Loop3x_cbc_dec -+ -+ vld1.8 {$dat3},[$inp],#16 -+ vld1.8 {$dat4},[$inp],#16 -+ sub $len,$len,#32 // bias -+ mov $cnt,$rounds -+ vorr $in3,$dat3,$dat3 -+ vorr $in4,$dat4,$dat4 -+ -+.Loop5x_cbc_dec: -+ aesd $dat0,q8 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q8 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q8 -+ aesimc $dat2,$dat2 -+ aesd $dat3,q8 -+ aesimc $dat3,$dat3 -+ aesd $dat4,q8 -+ aesimc $dat4,$dat4 -+ vld1.32 {q8},[$key_],#16 -+ subs $cnt,$cnt,#2 -+ aesd $dat0,q9 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q9 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q9 -+ aesimc $dat2,$dat2 -+ aesd $dat3,q9 -+ aesimc $dat3,$dat3 -+ aesd $dat4,q9 -+ aesimc $dat4,$dat4 -+ vld1.32 {q9},[$key_],#16 -+ b.gt .Loop5x_cbc_dec -+ -+ aesd $dat0,q8 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q8 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q8 -+ aesimc $dat2,$dat2 -+ aesd $dat3,q8 -+ aesimc $dat3,$dat3 -+ aesd $dat4,q8 -+ aesimc $dat4,$dat4 -+ cmp $len,#0x40 // because .Lcbc_tail4x -+ sub $len,$len,#0x50 -+ -+ aesd $dat0,q9 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q9 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q9 -+ aesimc $dat2,$dat2 -+ aesd $dat3,q9 -+ aesimc $dat3,$dat3 -+ aesd $dat4,q9 -+ aesimc $dat4,$dat4 -+ csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo -+ mov $key_,$key -+ -+ aesd $dat0,q10 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q10 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q10 -+ aesimc $dat2,$dat2 -+ aesd $dat3,q10 -+ aesimc $dat3,$dat3 -+ aesd $dat4,q10 -+ aesimc $dat4,$dat4 -+ add $inp,$inp,x6 // $inp is adjusted in such way that -+ // at exit from the loop $dat1-$dat4 -+ // are loaded with last "words" -+ add x6,$len,#0x60 // because .Lcbc_tail4x -+ -+ aesd $dat0,q11 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q11 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q11 -+ aesimc $dat2,$dat2 -+ aesd $dat3,q11 -+ aesimc $dat3,$dat3 -+ aesd $dat4,q11 -+ aesimc $dat4,$dat4 -+ -+ aesd $dat0,q12 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q12 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q12 -+ aesimc $dat2,$dat2 -+ aesd $dat3,q12 -+ aesimc $dat3,$dat3 -+ aesd $dat4,q12 -+ aesimc $dat4,$dat4 -+ -+ aesd $dat0,q13 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q13 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q13 -+ aesimc $dat2,$dat2 -+ aesd $dat3,q13 -+ aesimc $dat3,$dat3 -+ aesd $dat4,q13 -+ aesimc $dat4,$dat4 -+ -+ aesd $dat0,q14 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q14 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q14 -+ aesimc $dat2,$dat2 -+ aesd $dat3,q14 -+ aesimc $dat3,$dat3 -+ aesd $dat4,q14 -+ aesimc $dat4,$dat4 - -+ veor $tmp0,$ivec,$rndlast -+ aesd $dat0,q15 -+ veor $tmp1,$in0,$rndlast -+ vld1.8 {$in0},[$inp],#16 -+ aesd $dat1,q15 -+ veor $tmp2,$in1,$rndlast -+ vld1.8 {$in1},[$inp],#16 -+ aesd $dat2,q15 -+ veor $tmp3,$in2,$rndlast -+ vld1.8 {$in2},[$inp],#16 -+ aesd $dat3,q15 -+ veor $tmp4,$in3,$rndlast -+ vld1.8 {$in3},[$inp],#16 -+ aesd $dat4,q15 -+ vorr $ivec,$in4,$in4 -+ vld1.8 {$in4},[$inp],#16 -+ cbz x6,.Lcbc_tail4x -+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] -+ veor $tmp0,$tmp0,$dat0 -+ vorr $dat0,$in0,$in0 -+ veor $tmp1,$tmp1,$dat1 -+ vorr 
$dat1,$in1,$in1 -+ veor $tmp2,$tmp2,$dat2 -+ vorr $dat2,$in2,$in2 -+ veor $tmp3,$tmp3,$dat3 -+ vorr $dat3,$in3,$in3 -+ veor $tmp4,$tmp4,$dat4 -+ vst1.8 {$tmp0},[$out],#16 -+ vorr $dat4,$in4,$in4 -+ vst1.8 {$tmp1},[$out],#16 -+ mov $cnt,$rounds -+ vst1.8 {$tmp2},[$out],#16 -+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] -+ vst1.8 {$tmp3},[$out],#16 -+ vst1.8 {$tmp4},[$out],#16 -+ b.hs .Loop5x_cbc_dec -+ -+ add $len,$len,#0x50 -+ cbz $len,.Lcbc_done -+ -+ add $cnt,$rounds,#2 -+ subs $len,$len,#0x30 -+ vorr $dat0,$in2,$in2 -+ vorr $in0,$in2,$in2 -+ vorr $dat1,$in3,$in3 -+ vorr $in1,$in3,$in3 -+ vorr $dat2,$in4,$in4 -+ vorr $in2,$in4,$in4 -+ b.lo .Lcbc_dec_tail -+ -+ b .Loop3x_cbc_dec -+ -+.align 4 -+.Lcbc_tail4x: -+ veor $tmp1,$tmp0,$dat1 -+ veor $tmp2,$tmp2,$dat2 -+ veor $tmp3,$tmp3,$dat3 -+ veor $tmp4,$tmp4,$dat4 -+ vst1.8 {$tmp1},[$out],#16 -+ vst1.8 {$tmp2},[$out],#16 -+ vst1.8 {$tmp3},[$out],#16 -+ vst1.8 {$tmp4},[$out],#16 -+ -+ b .Lcbc_done -+.align 4 -+___ -+$code.=<<___; - .Loop3x_cbc_dec: - aesd $dat0,q8 - aesimc $dat0,$dat0 -@@ -696,6 +1750,9 @@ my $step="x12"; # aliases with $tctr2 - my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); - my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); - -+# used only in 64-bit mode... -+my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23)); -+ - my ($dat,$tmp)=($dat0,$tmp0); - - ### q8-q15 preloaded key schedule -@@ -751,6 +1808,175 @@ $code.=<<___; - vmov.32 ${ivec}[3],$tctr2 - sub $len,$len,#3 // bias - vorr $dat2,$ivec,$ivec -+___ -+$code.=<<___ if ($flavour =~ /64/); -+ cmp $len,#2 -+ b.lo .Loop3x_ctr32 -+ -+ add w13,$ctr,#1 -+ add w14,$ctr,#2 -+ vorr $dat3,$dat0,$dat0 -+ rev w13,w13 -+ vorr $dat4,$dat0,$dat0 -+ rev w14,w14 -+ vmov.32 ${dat3}[3],w13 -+ sub $len,$len,#2 // bias -+ vmov.32 ${dat4}[3],w14 -+ add $ctr,$ctr,#2 -+ b .Loop5x_ctr32 -+ -+.align 4 -+.Loop5x_ctr32: -+ aese $dat0,q8 -+ aesmc $dat0,$dat0 -+ aese $dat1,q8 -+ aesmc $dat1,$dat1 -+ aese $dat2,q8 -+ aesmc $dat2,$dat2 -+ aese $dat3,q8 -+ aesmc $dat3,$dat3 -+ aese $dat4,q8 -+ aesmc $dat4,$dat4 -+ vld1.32 {q8},[$key_],#16 -+ subs $cnt,$cnt,#2 -+ aese $dat0,q9 -+ aesmc $dat0,$dat0 -+ aese $dat1,q9 -+ aesmc $dat1,$dat1 -+ aese $dat2,q9 -+ aesmc $dat2,$dat2 -+ aese $dat3,q9 -+ aesmc $dat3,$dat3 -+ aese $dat4,q9 -+ aesmc $dat4,$dat4 -+ vld1.32 {q9},[$key_],#16 -+ b.gt .Loop5x_ctr32 -+ -+ mov $key_,$key -+ aese $dat0,q8 -+ aesmc $dat0,$dat0 -+ aese $dat1,q8 -+ aesmc $dat1,$dat1 -+ aese $dat2,q8 -+ aesmc $dat2,$dat2 -+ aese $dat3,q8 -+ aesmc $dat3,$dat3 -+ aese $dat4,q8 -+ aesmc $dat4,$dat4 -+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] -+ -+ aese $dat0,q9 -+ aesmc $dat0,$dat0 -+ aese $dat1,q9 -+ aesmc $dat1,$dat1 -+ aese $dat2,q9 -+ aesmc $dat2,$dat2 -+ aese $dat3,q9 -+ aesmc $dat3,$dat3 -+ aese $dat4,q9 -+ aesmc $dat4,$dat4 -+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] -+ -+ aese $dat0,q12 -+ aesmc $dat0,$dat0 -+ add $tctr0,$ctr,#1 -+ add $tctr1,$ctr,#2 -+ aese $dat1,q12 -+ aesmc $dat1,$dat1 -+ add $tctr2,$ctr,#3 -+ add w13,$ctr,#4 -+ aese $dat2,q12 -+ aesmc $dat2,$dat2 -+ add w14,$ctr,#5 -+ rev $tctr0,$tctr0 -+ aese $dat3,q12 -+ aesmc $dat3,$dat3 -+ rev $tctr1,$tctr1 -+ rev $tctr2,$tctr2 -+ aese $dat4,q12 -+ aesmc $dat4,$dat4 -+ rev w13,w13 -+ rev w14,w14 -+ -+ aese $dat0,q13 -+ aesmc $dat0,$dat0 -+ aese $dat1,q13 -+ aesmc $dat1,$dat1 -+ aese $dat2,q13 -+ aesmc $dat2,$dat2 -+ aese $dat3,q13 -+ aesmc $dat3,$dat3 -+ aese $dat4,q13 -+ aesmc $dat4,$dat4 -+ -+ aese $dat0,q14 -+ aesmc $dat0,$dat0 -+ vld1.8 {$in0},[$inp],#16 -+ aese $dat1,q14 -+ aesmc $dat1,$dat1 
-+ vld1.8 {$in1},[$inp],#16 -+ aese $dat2,q14 -+ aesmc $dat2,$dat2 -+ vld1.8 {$in2},[$inp],#16 -+ aese $dat3,q14 -+ aesmc $dat3,$dat3 -+ vld1.8 {$in3},[$inp],#16 -+ aese $dat4,q14 -+ aesmc $dat4,$dat4 -+ vld1.8 {$in4},[$inp],#16 -+ -+ aese $dat0,q15 -+ veor $in0,$in0,$rndlast -+ aese $dat1,q15 -+ veor $in1,$in1,$rndlast -+ aese $dat2,q15 -+ veor $in2,$in2,$rndlast -+ aese $dat3,q15 -+ veor $in3,$in3,$rndlast -+ aese $dat4,q15 -+ veor $in4,$in4,$rndlast -+ -+ veor $in0,$in0,$dat0 -+ vorr $dat0,$ivec,$ivec -+ veor $in1,$in1,$dat1 -+ vorr $dat1,$ivec,$ivec -+ veor $in2,$in2,$dat2 -+ vorr $dat2,$ivec,$ivec -+ veor $in3,$in3,$dat3 -+ vorr $dat3,$ivec,$ivec -+ veor $in4,$in4,$dat4 -+ vorr $dat4,$ivec,$ivec -+ -+ vst1.8 {$in0},[$out],#16 -+ vmov.32 ${dat0}[3],$tctr0 -+ vst1.8 {$in1},[$out],#16 -+ vmov.32 ${dat1}[3],$tctr1 -+ vst1.8 {$in2},[$out],#16 -+ vmov.32 ${dat2}[3],$tctr2 -+ vst1.8 {$in3},[$out],#16 -+ vmov.32 ${dat3}[3],w13 -+ vst1.8 {$in4},[$out],#16 -+ vmov.32 ${dat4}[3],w14 -+ -+ mov $cnt,$rounds -+ cbz $len,.Lctr32_done -+ -+ add $ctr,$ctr,#5 -+ subs $len,$len,#5 -+ b.hs .Loop5x_ctr32 -+ -+ add $len,$len,#5 -+ sub $ctr,$ctr,#5 -+ -+ cmp $len,#2 -+ mov $step,#16 -+ cclr $step,lo -+ b.ls .Lctr32_tail -+ -+ sub $len,$len,#3 // bias -+ add $ctr,$ctr,#3 -+___ -+$code.=<<___; - b .Loop3x_ctr32 - - .align 4 -@@ -905,6 +2131,1432 @@ $code.=<<___; - .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks - ___ - }}} -+# Performance in cycles per byte. -+# Processed with AES-XTS different key size. -+# It shows the value before and after optimization as below: -+# (before/after): -+# -+# AES-128-XTS AES-256-XTS -+# Cortex-A57 3.36/1.09 4.02/1.37 -+# Cortex-A72 3.03/1.02 3.28/1.33 -+ -+# Optimization is implemented by loop unrolling and interleaving. -+# Commonly, we choose the unrolling factor as 5, if the input -+# data size smaller than 5 blocks, but not smaller than 3 blocks, -+# choose 3 as the unrolling factor. -+# If the input data size dsize >= 5*16 bytes, then take 5 blocks -+# as one iteration, every loop the left size lsize -= 5*16. -+# If lsize < 5*16 bytes, treat them as the tail. Note: left 4*16 bytes -+# will be processed specially, which be integrated into the 5*16 bytes -+# loop to improve the efficiency. -+# There is one special case, if the original input data size dsize -+# = 16 bytes, we will treat it seperately to improve the -+# performance: one independent code block without LR, FP load and -+# store. -+# Encryption will process the (length -tailcnt) bytes as mentioned -+# previously, then encrypt the composite block as last second -+# cipher block. -+# Decryption will process the (length -tailcnt -1) bytes as mentioned -+# previously, then decrypt the last second cipher block to get the -+# last plain block(tail), decrypt the composite block as last second -+# plain text block. 
-+ -+{{{ -+my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5)); -+my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10"); -+my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20"); -+my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19"); -+my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11"); -+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); -+my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b"); -+my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]"); -+my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]"); -+ -+my ($tmpin)=("v26.16b"); -+my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); -+ -+# q7 last round key -+# q10-q15, q7 Last 7 round keys -+# q8-q9 preloaded round keys except last 7 keys for big size -+# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte -+ -+ -+my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); -+ -+my ($dat3,$in3,$tmp3); # used only in 64-bit mode -+my ($dat4,$in4,$tmp4); -+if ($flavour =~ /64/) { -+ ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); -+} -+ -+$code.=<<___ if ($flavour =~ /64/); -+.globl ${prefix}_xts_encrypt -+.type ${prefix}_xts_encrypt,%function -+.align 5 -+${prefix}_xts_encrypt: -+___ -+$code.=<<___ if ($flavour =~ /64/); -+ cmp $len,#16 -+ // Original input data size bigger than 16, jump to big size processing. -+ b.ne .Lxts_enc_big_size -+ // Encrypt the iv with key2, as the first XEX iv. -+ ldr $rounds,[$key2,#240] -+ vld1.8 {$dat},[$key2],#16 -+ vld1.8 {$iv0},[$ivp] -+ sub $rounds,$rounds,#2 -+ vld1.8 {$dat1},[$key2],#16 -+ -+.Loop_enc_iv_enc: -+ aese $iv0,$dat -+ aesmc $iv0,$iv0 -+ vld1.32 {$dat},[$key2],#16 -+ subs $rounds,$rounds,#2 -+ aese $iv0,$dat1 -+ aesmc $iv0,$iv0 -+ vld1.32 {$dat1},[$key2],#16 -+ b.gt .Loop_enc_iv_enc -+ -+ aese $iv0,$dat -+ aesmc $iv0,$iv0 -+ vld1.32 {$dat},[$key2] -+ aese $iv0,$dat1 -+ veor $iv0,$iv0,$dat -+ -+ vld1.8 {$dat0},[$inp] -+ veor $dat0,$iv0,$dat0 -+ -+ ldr $rounds,[$key1,#240] -+ vld1.32 {q20-q21},[$key1],#32 // load key schedule... -+ -+ aese $dat0,q20 -+ aesmc $dat0,$dat0 -+ vld1.32 {q8-q9},[$key1],#32 // load key schedule... -+ aese $dat0,q21 -+ aesmc $dat0,$dat0 -+ subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-xts processing -+ b.eq .Lxts_128_enc -+.Lxts_enc_round_loop: -+ aese $dat0,q8 -+ aesmc $dat0,$dat0 -+ vld1.32 {q8},[$key1],#16 // load key schedule... -+ aese $dat0,q9 -+ aesmc $dat0,$dat0 -+ vld1.32 {q9},[$key1],#16 // load key schedule... -+ subs $rounds,$rounds,#2 // bias -+ b.gt .Lxts_enc_round_loop -+.Lxts_128_enc: -+ vld1.32 {q10-q11},[$key1],#32 // load key schedule... -+ aese $dat0,q8 -+ aesmc $dat0,$dat0 -+ aese $dat0,q9 -+ aesmc $dat0,$dat0 -+ vld1.32 {q12-q13},[$key1],#32 // load key schedule... -+ aese $dat0,q10 -+ aesmc $dat0,$dat0 -+ aese $dat0,q11 -+ aesmc $dat0,$dat0 -+ vld1.32 {q14-q15},[$key1],#32 // load key schedule... -+ aese $dat0,q12 -+ aesmc $dat0,$dat0 -+ aese $dat0,q13 -+ aesmc $dat0,$dat0 -+ vld1.32 {$rndlast},[$key1] -+ aese $dat0,q14 -+ aesmc $dat0,$dat0 -+ aese $dat0,q15 -+ veor $dat0,$dat0,$rndlast -+ veor $dat0,$dat0,$iv0 -+ vst1.8 {$dat0},[$out] -+ b .Lxts_enc_final_abort -+ -+.align 4 -+.Lxts_enc_big_size: -+___ -+$code.=<<___ if ($flavour =~ /64/); -+ stp $constnumx,$tmpinp,[sp,#-64]! -+ stp $tailcnt,$midnumx,[sp,#48] -+ stp $ivd10,$ivd20,[sp,#32] -+ stp $ivd30,$ivd40,[sp,#16] -+ -+ // tailcnt store the tail value of length%16. 
-+ and $tailcnt,$len,#0xf -+ and $len,$len,#-16 -+ subs $len,$len,#16 -+ mov $step,#16 -+ b.lo .Lxts_abort -+ csel $step,xzr,$step,eq -+ -+ // Firstly, encrypt the iv with key2, as the first iv of XEX. -+ ldr $rounds,[$key2,#240] -+ vld1.32 {$dat},[$key2],#16 -+ vld1.8 {$iv0},[$ivp] -+ sub $rounds,$rounds,#2 -+ vld1.32 {$dat1},[$key2],#16 -+ -+.Loop_iv_enc: -+ aese $iv0,$dat -+ aesmc $iv0,$iv0 -+ vld1.32 {$dat},[$key2],#16 -+ subs $rounds,$rounds,#2 -+ aese $iv0,$dat1 -+ aesmc $iv0,$iv0 -+ vld1.32 {$dat1},[$key2],#16 -+ b.gt .Loop_iv_enc -+ -+ aese $iv0,$dat -+ aesmc $iv0,$iv0 -+ vld1.32 {$dat},[$key2] -+ aese $iv0,$dat1 -+ veor $iv0,$iv0,$dat -+ -+ // The iv for second block -+ // $ivl- iv(low), $ivh - iv(high) -+ // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4 -+ fmov $ivl,$ivd00 -+ fmov $ivh,$ivd01 -+ mov $constnum,#0x87 -+ extr $midnumx,$ivh,$ivh,#32 -+ extr $ivh,$ivh,$ivl,#63 -+ and $tmpmw,$constnum,$midnum,asr#31 -+ eor $ivl,$tmpmx,$ivl,lsl#1 -+ fmov $ivd10,$ivl -+ fmov $ivd11,$ivh -+ -+ ldr $rounds0,[$key1,#240] // next starting point -+ vld1.8 {$dat},[$inp],$step -+ -+ vld1.32 {q8-q9},[$key1] // load key schedule... -+ sub $rounds0,$rounds0,#6 -+ add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys -+ sub $rounds0,$rounds0,#2 -+ vld1.32 {q10-q11},[$key_],#32 -+ vld1.32 {q12-q13},[$key_],#32 -+ vld1.32 {q14-q15},[$key_],#32 -+ vld1.32 {$rndlast},[$key_] -+ -+ add $key_,$key1,#32 -+ mov $rounds,$rounds0 -+ -+ // Encryption -+.Lxts_enc: -+ vld1.8 {$dat2},[$inp],#16 -+ subs $len,$len,#32 // bias -+ add $rounds,$rounds0,#2 -+ vorr $in1,$dat,$dat -+ vorr $dat1,$dat,$dat -+ vorr $in3,$dat,$dat -+ vorr $in2,$dat2,$dat2 -+ vorr $in4,$dat2,$dat2 -+ b.lo .Lxts_inner_enc_tail -+ veor $dat,$dat,$iv0 // before encryption, xor with iv -+ veor $dat2,$dat2,$iv1 -+ -+ // The iv for third block -+ extr $midnumx,$ivh,$ivh,#32 -+ extr $ivh,$ivh,$ivl,#63 -+ and $tmpmw,$constnum,$midnum,asr#31 -+ eor $ivl,$tmpmx,$ivl,lsl#1 -+ fmov $ivd20,$ivl -+ fmov $ivd21,$ivh -+ -+ -+ vorr $dat1,$dat2,$dat2 -+ vld1.8 {$dat2},[$inp],#16 -+ vorr $in0,$dat,$dat -+ vorr $in1,$dat1,$dat1 -+ veor $in2,$dat2,$iv2 // the third block -+ veor $dat2,$dat2,$iv2 -+ cmp $len,#32 -+ b.lo .Lxts_outer_enc_tail -+ -+ // The iv for fourth block -+ extr $midnumx,$ivh,$ivh,#32 -+ extr $ivh,$ivh,$ivl,#63 -+ and $tmpmw,$constnum,$midnum,asr#31 -+ eor $ivl,$tmpmx,$ivl,lsl#1 -+ fmov $ivd30,$ivl -+ fmov $ivd31,$ivh -+ -+ vld1.8 {$dat3},[$inp],#16 -+ // The iv for fifth block -+ extr $midnumx,$ivh,$ivh,#32 -+ extr $ivh,$ivh,$ivl,#63 -+ and $tmpmw,$constnum,$midnum,asr#31 -+ eor $ivl,$tmpmx,$ivl,lsl#1 -+ fmov $ivd40,$ivl -+ fmov $ivd41,$ivh -+ -+ vld1.8 {$dat4},[$inp],#16 -+ veor $dat3,$dat3,$iv3 // the fourth block -+ veor $dat4,$dat4,$iv4 -+ sub $len,$len,#32 // bias -+ mov $rounds,$rounds0 -+ b .Loop5x_xts_enc -+ -+.align 4 -+.Loop5x_xts_enc: -+ aese $dat0,q8 -+ aesmc $dat0,$dat0 -+ aese $dat1,q8 -+ aesmc $dat1,$dat1 -+ aese $dat2,q8 -+ aesmc $dat2,$dat2 -+ aese $dat3,q8 -+ aesmc $dat3,$dat3 -+ aese $dat4,q8 -+ aesmc $dat4,$dat4 -+ vld1.32 {q8},[$key_],#16 -+ subs $rounds,$rounds,#2 -+ aese $dat0,q9 -+ aesmc $dat0,$dat0 -+ aese $dat1,q9 -+ aesmc $dat1,$dat1 -+ aese $dat2,q9 -+ aesmc $dat2,$dat2 -+ aese $dat3,q9 -+ aesmc $dat3,$dat3 -+ aese $dat4,q9 -+ aesmc $dat4,$dat4 -+ vld1.32 {q9},[$key_],#16 -+ b.gt .Loop5x_xts_enc -+ -+ aese $dat0,q8 -+ aesmc $dat0,$dat0 -+ aese $dat1,q8 -+ aesmc $dat1,$dat1 -+ aese $dat2,q8 -+ aesmc $dat2,$dat2 -+ aese $dat3,q8 -+ aesmc $dat3,$dat3 -+ aese $dat4,q8 -+ aesmc $dat4,$dat4 -+ subs 
$len,$len,#0x50 // because .Lxts_enc_tail4x -+ -+ aese $dat0,q9 -+ aesmc $dat0,$dat0 -+ aese $dat1,q9 -+ aesmc $dat1,$dat1 -+ aese $dat2,q9 -+ aesmc $dat2,$dat2 -+ aese $dat3,q9 -+ aesmc $dat3,$dat3 -+ aese $dat4,q9 -+ aesmc $dat4,$dat4 -+ csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo -+ mov $key_,$key1 -+ -+ aese $dat0,q10 -+ aesmc $dat0,$dat0 -+ aese $dat1,q10 -+ aesmc $dat1,$dat1 -+ aese $dat2,q10 -+ aesmc $dat2,$dat2 -+ aese $dat3,q10 -+ aesmc $dat3,$dat3 -+ aese $dat4,q10 -+ aesmc $dat4,$dat4 -+ add $inp,$inp,$xoffset // x0 is adjusted in such way that -+ // at exit from the loop v1.16b-v26.16b -+ // are loaded with last "words" -+ add $xoffset,$len,#0x60 // because .Lxts_enc_tail4x -+ -+ aese $dat0,q11 -+ aesmc $dat0,$dat0 -+ aese $dat1,q11 -+ aesmc $dat1,$dat1 -+ aese $dat2,q11 -+ aesmc $dat2,$dat2 -+ aese $dat3,q11 -+ aesmc $dat3,$dat3 -+ aese $dat4,q11 -+ aesmc $dat4,$dat4 -+ -+ aese $dat0,q12 -+ aesmc $dat0,$dat0 -+ aese $dat1,q12 -+ aesmc $dat1,$dat1 -+ aese $dat2,q12 -+ aesmc $dat2,$dat2 -+ aese $dat3,q12 -+ aesmc $dat3,$dat3 -+ aese $dat4,q12 -+ aesmc $dat4,$dat4 -+ -+ aese $dat0,q13 -+ aesmc $dat0,$dat0 -+ aese $dat1,q13 -+ aesmc $dat1,$dat1 -+ aese $dat2,q13 -+ aesmc $dat2,$dat2 -+ aese $dat3,q13 -+ aesmc $dat3,$dat3 -+ aese $dat4,q13 -+ aesmc $dat4,$dat4 -+ -+ aese $dat0,q14 -+ aesmc $dat0,$dat0 -+ aese $dat1,q14 -+ aesmc $dat1,$dat1 -+ aese $dat2,q14 -+ aesmc $dat2,$dat2 -+ aese $dat3,q14 -+ aesmc $dat3,$dat3 -+ aese $dat4,q14 -+ aesmc $dat4,$dat4 -+ -+ veor $tmp0,$rndlast,$iv0 -+ aese $dat0,q15 -+ // The iv for first block of one iteration -+ extr $midnumx,$ivh,$ivh,#32 -+ extr $ivh,$ivh,$ivl,#63 -+ and $tmpmw,$constnum,$midnum,asr#31 -+ eor $ivl,$tmpmx,$ivl,lsl#1 -+ fmov $ivd00,$ivl -+ fmov $ivd01,$ivh -+ veor $tmp1,$rndlast,$iv1 -+ vld1.8 {$in0},[$inp],#16 -+ aese $dat1,q15 -+ // The iv for second block -+ extr $midnumx,$ivh,$ivh,#32 -+ extr $ivh,$ivh,$ivl,#63 -+ and $tmpmw,$constnum,$midnum,asr#31 -+ eor $ivl,$tmpmx,$ivl,lsl#1 -+ fmov $ivd10,$ivl -+ fmov $ivd11,$ivh -+ veor $tmp2,$rndlast,$iv2 -+ vld1.8 {$in1},[$inp],#16 -+ aese $dat2,q15 -+ // The iv for third block -+ extr $midnumx,$ivh,$ivh,#32 -+ extr $ivh,$ivh,$ivl,#63 -+ and $tmpmw,$constnum,$midnum,asr#31 -+ eor $ivl,$tmpmx,$ivl,lsl#1 -+ fmov $ivd20,$ivl -+ fmov $ivd21,$ivh -+ veor $tmp3,$rndlast,$iv3 -+ vld1.8 {$in2},[$inp],#16 -+ aese $dat3,q15 -+ // The iv for fourth block -+ extr $midnumx,$ivh,$ivh,#32 -+ extr $ivh,$ivh,$ivl,#63 -+ and $tmpmw,$constnum,$midnum,asr#31 -+ eor $ivl,$tmpmx,$ivl,lsl#1 -+ fmov $ivd30,$ivl -+ fmov $ivd31,$ivh -+ veor $tmp4,$rndlast,$iv4 -+ vld1.8 {$in3},[$inp],#16 -+ aese $dat4,q15 -+ -+ // The iv for fifth block -+ extr $midnumx,$ivh,$ivh,#32 -+ extr $ivh,$ivh,$ivl,#63 -+ and $tmpmw,$constnum,$midnum,asr #31 -+ eor $ivl,$tmpmx,$ivl,lsl #1 -+ fmov $ivd40,$ivl -+ fmov $ivd41,$ivh -+ -+ vld1.8 {$in4},[$inp],#16 -+ cbz $xoffset,.Lxts_enc_tail4x -+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] -+ veor $tmp0,$tmp0,$dat0 -+ veor $dat0,$in0,$iv0 -+ veor $tmp1,$tmp1,$dat1 -+ veor $dat1,$in1,$iv1 -+ veor $tmp2,$tmp2,$dat2 -+ veor $dat2,$in2,$iv2 -+ veor $tmp3,$tmp3,$dat3 -+ veor $dat3,$in3,$iv3 -+ veor $tmp4,$tmp4,$dat4 -+ vst1.8 {$tmp0},[$out],#16 -+ veor $dat4,$in4,$iv4 -+ vst1.8 {$tmp1},[$out],#16 -+ mov $rounds,$rounds0 -+ vst1.8 {$tmp2},[$out],#16 -+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] -+ vst1.8 {$tmp3},[$out],#16 -+ vst1.8 {$tmp4},[$out],#16 -+ b.hs .Loop5x_xts_enc -+ -+ -+ // If left 4 blocks, borrow the five block's processing. 
-+ cmn $len,#0x10 -+ b.ne .Loop5x_enc_after -+ vorr $iv4,$iv3,$iv3 -+ vorr $iv3,$iv2,$iv2 -+ vorr $iv2,$iv1,$iv1 -+ vorr $iv1,$iv0,$iv0 -+ fmov $ivl,$ivd40 -+ fmov $ivh,$ivd41 -+ veor $dat0,$iv0,$in0 -+ veor $dat1,$iv1,$in1 -+ veor $dat2,$in2,$iv2 -+ veor $dat3,$in3,$iv3 -+ veor $dat4,$in4,$iv4 -+ b.eq .Loop5x_xts_enc -+ -+.Loop5x_enc_after: -+ add $len,$len,#0x50 -+ cbz $len,.Lxts_enc_done -+ -+ add $rounds,$rounds0,#2 -+ subs $len,$len,#0x30 -+ b.lo .Lxts_inner_enc_tail -+ -+ veor $dat0,$iv0,$in2 -+ veor $dat1,$iv1,$in3 -+ veor $dat2,$in4,$iv2 -+ b .Lxts_outer_enc_tail -+ -+.align 4 -+.Lxts_enc_tail4x: -+ add $inp,$inp,#16 -+ veor $tmp1,$dat1,$tmp1 -+ vst1.8 {$tmp1},[$out],#16 -+ veor $tmp2,$dat2,$tmp2 -+ vst1.8 {$tmp2},[$out],#16 -+ veor $tmp3,$dat3,$tmp3 -+ veor $tmp4,$dat4,$tmp4 -+ vst1.8 {$tmp3-$tmp4},[$out],#32 -+ -+ b .Lxts_enc_done -+.align 4 -+.Lxts_outer_enc_tail: -+ aese $dat0,q8 -+ aesmc $dat0,$dat0 -+ aese $dat1,q8 -+ aesmc $dat1,$dat1 -+ aese $dat2,q8 -+ aesmc $dat2,$dat2 -+ vld1.32 {q8},[$key_],#16 -+ subs $rounds,$rounds,#2 -+ aese $dat0,q9 -+ aesmc $dat0,$dat0 -+ aese $dat1,q9 -+ aesmc $dat1,$dat1 -+ aese $dat2,q9 -+ aesmc $dat2,$dat2 -+ vld1.32 {q9},[$key_],#16 -+ b.gt .Lxts_outer_enc_tail -+ -+ aese $dat0,q8 -+ aesmc $dat0,$dat0 -+ aese $dat1,q8 -+ aesmc $dat1,$dat1 -+ aese $dat2,q8 -+ aesmc $dat2,$dat2 -+ veor $tmp0,$iv0,$rndlast -+ subs $len,$len,#0x30 -+ // The iv for first block -+ fmov $ivl,$ivd20 -+ fmov $ivh,$ivd21 -+ //mov $constnum,#0x87 -+ extr $midnumx,$ivh,$ivh,#32 -+ extr $ivh,$ivh,$ivl,#63 -+ and $tmpmw,$constnum,$midnum,asr#31 -+ eor $ivl,$tmpmx,$ivl,lsl#1 -+ fmov $ivd00,$ivl -+ fmov $ivd01,$ivh -+ veor $tmp1,$iv1,$rndlast -+ csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point -+ aese $dat0,q9 -+ aesmc $dat0,$dat0 -+ aese $dat1,q9 -+ aesmc $dat1,$dat1 -+ aese $dat2,q9 -+ aesmc $dat2,$dat2 -+ veor $tmp2,$iv2,$rndlast -+ -+ add $xoffset,$xoffset,#0x20 -+ add $inp,$inp,$xoffset -+ mov $key_,$key1 -+ -+ aese $dat0,q12 -+ aesmc $dat0,$dat0 -+ aese $dat1,q12 -+ aesmc $dat1,$dat1 -+ aese $dat2,q12 -+ aesmc $dat2,$dat2 -+ aese $dat0,q13 -+ aesmc $dat0,$dat0 -+ aese $dat1,q13 -+ aesmc $dat1,$dat1 -+ aese $dat2,q13 -+ aesmc $dat2,$dat2 -+ aese $dat0,q14 -+ aesmc $dat0,$dat0 -+ aese $dat1,q14 -+ aesmc $dat1,$dat1 -+ aese $dat2,q14 -+ aesmc $dat2,$dat2 -+ aese $dat0,q15 -+ aese $dat1,q15 -+ aese $dat2,q15 -+ vld1.8 {$in2},[$inp],#16 -+ add $rounds,$rounds0,#2 -+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] -+ veor $tmp0,$tmp0,$dat0 -+ veor $tmp1,$tmp1,$dat1 -+ veor $dat2,$dat2,$tmp2 -+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] -+ vst1.8 {$tmp0},[$out],#16 -+ vst1.8 {$tmp1},[$out],#16 -+ vst1.8 {$dat2},[$out],#16 -+ cmn $len,#0x30 -+ b.eq .Lxts_enc_done -+.Lxts_encxor_one: -+ vorr $in3,$in1,$in1 -+ vorr $in4,$in2,$in2 -+ nop -+ -+.Lxts_inner_enc_tail: -+ cmn $len,#0x10 -+ veor $dat1,$in3,$iv0 -+ veor $dat2,$in4,$iv1 -+ b.eq .Lxts_enc_tail_loop -+ veor $dat2,$in4,$iv0 -+.Lxts_enc_tail_loop: -+ aese $dat1,q8 -+ aesmc $dat1,$dat1 -+ aese $dat2,q8 -+ aesmc $dat2,$dat2 -+ vld1.32 {q8},[$key_],#16 -+ subs $rounds,$rounds,#2 -+ aese $dat1,q9 -+ aesmc $dat1,$dat1 -+ aese $dat2,q9 -+ aesmc $dat2,$dat2 -+ vld1.32 {q9},[$key_],#16 -+ b.gt .Lxts_enc_tail_loop -+ -+ aese $dat1,q8 -+ aesmc $dat1,$dat1 -+ aese $dat2,q8 -+ aesmc $dat2,$dat2 -+ aese $dat1,q9 -+ aesmc $dat1,$dat1 -+ aese $dat2,q9 -+ aesmc $dat2,$dat2 -+ aese $dat1,q12 -+ aesmc $dat1,$dat1 -+ aese $dat2,q12 -+ aesmc $dat2,$dat2 -+ cmn $len,#0x20 -+ aese $dat1,q13 -+ aesmc $dat1,$dat1 -+ aese 
$dat2,q13 -+ aesmc $dat2,$dat2 -+ veor $tmp1,$iv0,$rndlast -+ aese $dat1,q14 -+ aesmc $dat1,$dat1 -+ aese $dat2,q14 -+ aesmc $dat2,$dat2 -+ veor $tmp2,$iv1,$rndlast -+ aese $dat1,q15 -+ aese $dat2,q15 -+ b.eq .Lxts_enc_one -+ veor $tmp1,$tmp1,$dat1 -+ vst1.8 {$tmp1},[$out],#16 -+ veor $tmp2,$tmp2,$dat2 -+ vorr $iv0,$iv1,$iv1 -+ vst1.8 {$tmp2},[$out],#16 -+ fmov $ivl,$ivd10 -+ fmov $ivh,$ivd11 -+ mov $constnum,#0x87 -+ extr $midnumx,$ivh,$ivh,#32 -+ extr $ivh,$ivh,$ivl,#63 -+ and $tmpmw,$constnum,$midnum,asr #31 -+ eor $ivl,$tmpmx,$ivl,lsl #1 -+ fmov $ivd00,$ivl -+ fmov $ivd01,$ivh -+ b .Lxts_enc_done -+ -+.Lxts_enc_one: -+ veor $tmp1,$tmp1,$dat2 -+ vorr $iv0,$iv0,$iv0 -+ vst1.8 {$tmp1},[$out],#16 -+ fmov $ivl,$ivd00 -+ fmov $ivh,$ivd01 -+ mov $constnum,#0x87 -+ extr $midnumx,$ivh,$ivh,#32 -+ extr $ivh,$ivh,$ivl,#63 -+ and $tmpmw,$constnum,$midnum,asr #31 -+ eor $ivl,$tmpmx,$ivl,lsl #1 -+ fmov $ivd00,$ivl -+ fmov $ivd01,$ivh -+ b .Lxts_enc_done -+.align 5 -+.Lxts_enc_done: -+ // Process the tail block with cipher stealing. -+ tst $tailcnt,#0xf -+ b.eq .Lxts_abort -+ -+ mov $tmpinp,$inp -+ mov $tmpoutp,$out -+ sub $out,$out,#16 -+.composite_enc_loop: -+ subs $tailcnt,$tailcnt,#1 -+ ldrb $l2outp,[$out,$tailcnt] -+ ldrb $loutp,[$tmpinp,$tailcnt] -+ strb $l2outp,[$tmpoutp,$tailcnt] -+ strb $loutp,[$out,$tailcnt] -+ b.gt .composite_enc_loop -+.Lxts_enc_load_done: -+ vld1.8 {$tmpin},[$out] -+ veor $tmpin,$tmpin,$iv0 -+ -+ // Encrypt the composite block to get the last second encrypted text block -+ ldr $rounds,[$key1,#240] // load key schedule... -+ vld1.8 {$dat},[$key1],#16 -+ sub $rounds,$rounds,#2 -+ vld1.8 {$dat1},[$key1],#16 // load key schedule... -+.Loop_final_enc: -+ aese $tmpin,$dat0 -+ aesmc $tmpin,$tmpin -+ vld1.32 {$dat0},[$key1],#16 -+ subs $rounds,$rounds,#2 -+ aese $tmpin,$dat1 -+ aesmc $tmpin,$tmpin -+ vld1.32 {$dat1},[$key1],#16 -+ b.gt .Loop_final_enc -+ -+ aese $tmpin,$dat0 -+ aesmc $tmpin,$tmpin -+ vld1.32 {$dat0},[$key1] -+ aese $tmpin,$dat1 -+ veor $tmpin,$tmpin,$dat0 -+ veor $tmpin,$tmpin,$iv0 -+ vst1.8 {$tmpin},[$out] -+ -+.Lxts_abort: -+ ldp $tailcnt,$midnumx,[sp,#48] -+ ldp $ivd10,$ivd20,[sp,#32] -+ ldp $ivd30,$ivd40,[sp,#16] -+ ldp $constnumx,$tmpinp,[sp],#64 -+.Lxts_enc_final_abort: -+ ret -+.size ${prefix}_xts_encrypt,.-${prefix}_xts_encrypt -+___ -+ -+}}} -+{{{ -+my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5)); -+my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10"); -+my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20"); -+my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19"); -+my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11"); -+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); -+my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b"); -+my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]"); -+my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]"); -+ -+my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); -+ -+# q7 last round key -+# q10-q15, q7 Last 7 round keys -+# q8-q9 preloaded round keys except last 7 keys for big size -+# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte -+ -+{ -+my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); -+ -+my ($dat3,$in3,$tmp3); # used only in 64-bit mode -+my ($dat4,$in4,$tmp4); -+if ($flavour =~ /64/) { -+ ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); -+} -+ -+$code.=<<___ if ($flavour =~ /64/); -+.globl 
${prefix}_xts_decrypt -+.type ${prefix}_xts_decrypt,%function -+.align 5 -+${prefix}_xts_decrypt: -+___ -+$code.=<<___ if ($flavour =~ /64/); -+ cmp $len,#16 -+ // Original input data size bigger than 16, jump to big size processing. -+ b.ne .Lxts_dec_big_size -+ // Encrypt the iv with key2, as the first XEX iv. -+ ldr $rounds,[$key2,#240] -+ vld1.8 {$dat},[$key2],#16 -+ vld1.8 {$iv0},[$ivp] -+ sub $rounds,$rounds,#2 -+ vld1.8 {$dat1},[$key2],#16 -+ -+.Loop_dec_small_iv_enc: -+ aese $iv0,$dat -+ aesmc $iv0,$iv0 -+ vld1.32 {$dat},[$key2],#16 -+ subs $rounds,$rounds,#2 -+ aese $iv0,$dat1 -+ aesmc $iv0,$iv0 -+ vld1.32 {$dat1},[$key2],#16 -+ b.gt .Loop_dec_small_iv_enc -+ -+ aese $iv0,$dat -+ aesmc $iv0,$iv0 -+ vld1.32 {$dat},[$key2] -+ aese $iv0,$dat1 -+ veor $iv0,$iv0,$dat -+ -+ vld1.8 {$dat0},[$inp] -+ veor $dat0,$iv0,$dat0 -+ -+ ldr $rounds,[$key1,#240] -+ vld1.32 {q20-q21},[$key1],#32 // load key schedule... -+ -+ aesd $dat0,q20 -+ aesimc $dat0,$dat0 -+ vld1.32 {q8-q9},[$key1],#32 // load key schedule... -+ aesd $dat0,q21 -+ aesimc $dat0,$dat0 -+ subs $rounds,$rounds,#10 // bias -+ b.eq .Lxts_128_dec -+.Lxts_dec_round_loop: -+ aesd $dat0,q8 -+ aesimc $dat0,$dat0 -+ vld1.32 {q8},[$key1],#16 // load key schedule... -+ aesd $dat0,q9 -+ aesimc $dat0,$dat0 -+ vld1.32 {q9},[$key1],#16 // load key schedule... -+ subs $rounds,$rounds,#2 // bias -+ b.gt .Lxts_dec_round_loop -+.Lxts_128_dec: -+ vld1.32 {q10-q11},[$key1],#32 // load key schedule... -+ aesd $dat0,q8 -+ aesimc $dat0,$dat0 -+ aesd $dat0,q9 -+ aesimc $dat0,$dat0 -+ vld1.32 {q12-q13},[$key1],#32 // load key schedule... -+ aesd $dat0,q10 -+ aesimc $dat0,$dat0 -+ aesd $dat0,q11 -+ aesimc $dat0,$dat0 -+ vld1.32 {q14-q15},[$key1],#32 // load key schedule... -+ aesd $dat0,q12 -+ aesimc $dat0,$dat0 -+ aesd $dat0,q13 -+ aesimc $dat0,$dat0 -+ vld1.32 {$rndlast},[$key1] -+ aesd $dat0,q14 -+ aesimc $dat0,$dat0 -+ aesd $dat0,q15 -+ veor $dat0,$dat0,$rndlast -+ veor $dat0,$iv0,$dat0 -+ vst1.8 {$dat0},[$out] -+ b .Lxts_dec_final_abort -+.Lxts_dec_big_size: -+___ -+$code.=<<___ if ($flavour =~ /64/); -+ stp $constnumx,$tmpinp,[sp,#-64]! -+ stp $tailcnt,$midnumx,[sp,#48] -+ stp $ivd10,$ivd20,[sp,#32] -+ stp $ivd30,$ivd40,[sp,#16] -+ -+ and $tailcnt,$len,#0xf -+ and $len,$len,#-16 -+ subs $len,$len,#16 -+ mov $step,#16 -+ b.lo .Lxts_dec_abort -+ -+ // Encrypt the iv with key2, as the first XEX iv -+ ldr $rounds,[$key2,#240] -+ vld1.8 {$dat},[$key2],#16 -+ vld1.8 {$iv0},[$ivp] -+ sub $rounds,$rounds,#2 -+ vld1.8 {$dat1},[$key2],#16 -+ -+.Loop_dec_iv_enc: -+ aese $iv0,$dat -+ aesmc $iv0,$iv0 -+ vld1.32 {$dat},[$key2],#16 -+ subs $rounds,$rounds,#2 -+ aese $iv0,$dat1 -+ aesmc $iv0,$iv0 -+ vld1.32 {$dat1},[$key2],#16 -+ b.gt .Loop_dec_iv_enc -+ -+ aese $iv0,$dat -+ aesmc $iv0,$iv0 -+ vld1.32 {$dat},[$key2] -+ aese $iv0,$dat1 -+ veor $iv0,$iv0,$dat -+ -+ // The iv for second block -+ // $ivl- iv(low), $ivh - iv(high) -+ // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4 -+ fmov $ivl,$ivd00 -+ fmov $ivh,$ivd01 -+ mov $constnum,#0x87 -+ extr $midnumx,$ivh,$ivh,#32 -+ extr $ivh,$ivh,$ivl,#63 -+ and $tmpmw,$constnum,$midnum,asr #31 -+ eor $ivl,$tmpmx,$ivl,lsl #1 -+ fmov $ivd10,$ivl -+ fmov $ivd11,$ivh -+ -+ ldr $rounds0,[$key1,#240] // load rounds number -+ -+ // The iv for third block -+ extr $midnumx,$ivh,$ivh,#32 -+ extr $ivh,$ivh,$ivl,#63 -+ and $tmpmw,$constnum,$midnum,asr #31 -+ eor $ivl,$tmpmx,$ivl,lsl #1 -+ fmov $ivd20,$ivl -+ fmov $ivd21,$ivh -+ -+ vld1.32 {q8-q9},[$key1] // load key schedule... 
-+ sub $rounds0,$rounds0,#6 -+ add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys -+ sub $rounds0,$rounds0,#2 -+ vld1.32 {q10-q11},[$key_],#32 // load key schedule... -+ vld1.32 {q12-q13},[$key_],#32 -+ vld1.32 {q14-q15},[$key_],#32 -+ vld1.32 {$rndlast},[$key_] -+ -+ // The iv for fourth block -+ extr $midnumx,$ivh,$ivh,#32 -+ extr $ivh,$ivh,$ivl,#63 -+ and $tmpmw,$constnum,$midnum,asr #31 -+ eor $ivl,$tmpmx,$ivl,lsl #1 -+ fmov $ivd30,$ivl -+ fmov $ivd31,$ivh -+ -+ add $key_,$key1,#32 -+ mov $rounds,$rounds0 -+ b .Lxts_dec -+ -+ // Decryption -+.align 5 -+.Lxts_dec: -+ tst $tailcnt,#0xf -+ b.eq .Lxts_dec_begin -+ subs $len,$len,#16 -+ csel $step,xzr,$step,eq -+ vld1.8 {$dat},[$inp],#16 -+ b.lo .Lxts_done -+ sub $inp,$inp,#16 -+.Lxts_dec_begin: -+ vld1.8 {$dat},[$inp],$step -+ subs $len,$len,#32 // bias -+ add $rounds,$rounds0,#2 -+ vorr $in1,$dat,$dat -+ vorr $dat1,$dat,$dat -+ vorr $in3,$dat,$dat -+ vld1.8 {$dat2},[$inp],#16 -+ vorr $in2,$dat2,$dat2 -+ vorr $in4,$dat2,$dat2 -+ b.lo .Lxts_inner_dec_tail -+ veor $dat,$dat,$iv0 // before decryt, xor with iv -+ veor $dat2,$dat2,$iv1 -+ -+ vorr $dat1,$dat2,$dat2 -+ vld1.8 {$dat2},[$inp],#16 -+ vorr $in0,$dat,$dat -+ vorr $in1,$dat1,$dat1 -+ veor $in2,$dat2,$iv2 // third block xox with third iv -+ veor $dat2,$dat2,$iv2 -+ cmp $len,#32 -+ b.lo .Lxts_outer_dec_tail -+ -+ vld1.8 {$dat3},[$inp],#16 -+ -+ // The iv for fifth block -+ extr $midnumx,$ivh,$ivh,#32 -+ extr $ivh,$ivh,$ivl,#63 -+ and $tmpmw,$constnum,$midnum,asr #31 -+ eor $ivl,$tmpmx,$ivl,lsl #1 -+ fmov $ivd40,$ivl -+ fmov $ivd41,$ivh -+ -+ vld1.8 {$dat4},[$inp],#16 -+ veor $dat3,$dat3,$iv3 // the fourth block -+ veor $dat4,$dat4,$iv4 -+ sub $len,$len,#32 // bias -+ mov $rounds,$rounds0 -+ b .Loop5x_xts_dec -+ -+.align 4 -+.Loop5x_xts_dec: -+ aesd $dat0,q8 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q8 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q8 -+ aesimc $dat2,$dat2 -+ aesd $dat3,q8 -+ aesimc $dat3,$dat3 -+ aesd $dat4,q8 -+ aesimc $dat4,$dat4 -+ vld1.32 {q8},[$key_],#16 // load key schedule... -+ subs $rounds,$rounds,#2 -+ aesd $dat0,q9 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q9 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q9 -+ aesimc $dat2,$dat2 -+ aesd $dat3,q9 -+ aesimc $dat3,$dat3 -+ aesd $dat4,q9 -+ aesimc $dat4,$dat4 -+ vld1.32 {q9},[$key_],#16 // load key schedule... 
-+ b.gt .Loop5x_xts_dec -+ -+ aesd $dat0,q8 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q8 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q8 -+ aesimc $dat2,$dat2 -+ aesd $dat3,q8 -+ aesimc $dat3,$dat3 -+ aesd $dat4,q8 -+ aesimc $dat4,$dat4 -+ subs $len,$len,#0x50 // because .Lxts_dec_tail4x -+ -+ aesd $dat0,q9 -+ aesimc $dat0,$dat -+ aesd $dat1,q9 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q9 -+ aesimc $dat2,$dat2 -+ aesd $dat3,q9 -+ aesimc $dat3,$dat3 -+ aesd $dat4,q9 -+ aesimc $dat4,$dat4 -+ csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo -+ mov $key_,$key1 -+ -+ aesd $dat0,q10 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q10 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q10 -+ aesimc $dat2,$dat2 -+ aesd $dat3,q10 -+ aesimc $dat3,$dat3 -+ aesd $dat4,q10 -+ aesimc $dat4,$dat4 -+ add $inp,$inp,$xoffset // x0 is adjusted in such way that -+ // at exit from the loop v1.16b-v26.16b -+ // are loaded with last "words" -+ add $xoffset,$len,#0x60 // because .Lxts_dec_tail4x -+ -+ aesd $dat0,q11 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q11 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q11 -+ aesimc $dat2,$dat2 -+ aesd $dat3,q11 -+ aesimc $dat3,$dat3 -+ aesd $dat4,q11 -+ aesimc $dat4,$dat4 -+ -+ aesd $dat0,q12 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q12 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q12 -+ aesimc $dat2,$dat2 -+ aesd $dat3,q12 -+ aesimc $dat3,$dat3 -+ aesd $dat4,q12 -+ aesimc $dat4,$dat4 -+ -+ aesd $dat0,q13 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q13 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q13 -+ aesimc $dat2,$dat2 -+ aesd $dat3,q13 -+ aesimc $dat3,$dat3 -+ aesd $dat4,q13 -+ aesimc $dat4,$dat4 -+ -+ aesd $dat0,q14 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q14 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q14 -+ aesimc $dat2,$dat2 -+ aesd $dat3,q14 -+ aesimc $dat3,$dat3 -+ aesd $dat4,q14 -+ aesimc $dat4,$dat4 -+ -+ veor $tmp0,$rndlast,$iv0 -+ aesd $dat0,q15 -+ // The iv for first block of next iteration. 
-+ extr $midnumx,$ivh,$ivh,#32 -+ extr $ivh,$ivh,$ivl,#63 -+ and $tmpmw,$constnum,$midnum,asr #31 -+ eor $ivl,$tmpmx,$ivl,lsl #1 -+ fmov $ivd00,$ivl -+ fmov $ivd01,$ivh -+ veor $tmp1,$rndlast,$iv1 -+ vld1.8 {$in0},[$inp],#16 -+ aesd $dat1,q15 -+ // The iv for second block -+ extr $midnumx,$ivh,$ivh,#32 -+ extr $ivh,$ivh,$ivl,#63 -+ and $tmpmw,$constnum,$midnum,asr #31 -+ eor $ivl,$tmpmx,$ivl,lsl #1 -+ fmov $ivd10,$ivl -+ fmov $ivd11,$ivh -+ veor $tmp2,$rndlast,$iv2 -+ vld1.8 {$in1},[$inp],#16 -+ aesd $dat2,q15 -+ // The iv for third block -+ extr $midnumx,$ivh,$ivh,#32 -+ extr $ivh,$ivh,$ivl,#63 -+ and $tmpmw,$constnum,$midnum,asr #31 -+ eor $ivl,$tmpmx,$ivl,lsl #1 -+ fmov $ivd20,$ivl -+ fmov $ivd21,$ivh -+ veor $tmp3,$rndlast,$iv3 -+ vld1.8 {$in2},[$inp],#16 -+ aesd $dat3,q15 -+ // The iv for fourth block -+ extr $midnumx,$ivh,$ivh,#32 -+ extr $ivh,$ivh,$ivl,#63 -+ and $tmpmw,$constnum,$midnum,asr #31 -+ eor $ivl,$tmpmx,$ivl,lsl #1 -+ fmov $ivd30,$ivl -+ fmov $ivd31,$ivh -+ veor $tmp4,$rndlast,$iv4 -+ vld1.8 {$in3},[$inp],#16 -+ aesd $dat4,q15 -+ -+ // The iv for fifth block -+ extr $midnumx,$ivh,$ivh,#32 -+ extr $ivh,$ivh,$ivl,#63 -+ and $tmpmw,$constnum,$midnum,asr #31 -+ eor $ivl,$tmpmx,$ivl,lsl #1 -+ fmov $ivd40,$ivl -+ fmov $ivd41,$ivh -+ -+ vld1.8 {$in4},[$inp],#16 -+ cbz $xoffset,.Lxts_dec_tail4x -+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] -+ veor $tmp0,$tmp0,$dat0 -+ veor $dat0,$in0,$iv0 -+ veor $tmp1,$tmp1,$dat1 -+ veor $dat1,$in1,$iv1 -+ veor $tmp2,$tmp2,$dat2 -+ veor $dat2,$in2,$iv2 -+ veor $tmp3,$tmp3,$dat3 -+ veor $dat3,$in3,$iv3 -+ veor $tmp4,$tmp4,$dat4 -+ vst1.8 {$tmp0},[$out],#16 -+ veor $dat4,$in4,$iv4 -+ vst1.8 {$tmp1},[$out],#16 -+ mov $rounds,$rounds0 -+ vst1.8 {$tmp2},[$out],#16 -+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] -+ vst1.8 {$tmp3},[$out],#16 -+ vst1.8 {$tmp4},[$out],#16 -+ b.hs .Loop5x_xts_dec -+ -+ cmn $len,#0x10 -+ b.ne .Loop5x_dec_after -+ // If x2($len) equal to -0x10, the left blocks is 4. -+ // After specially processing, utilize the five blocks processing again. -+ // It will use the following IVs: $iv0,$iv0,$iv1,$iv2,$iv3. 
-+ vorr $iv4,$iv3,$iv3 -+ vorr $iv3,$iv2,$iv2 -+ vorr $iv2,$iv1,$iv1 -+ vorr $iv1,$iv0,$iv0 -+ fmov $ivl,$ivd40 -+ fmov $ivh,$ivd41 -+ veor $dat0,$iv0,$in0 -+ veor $dat1,$iv1,$in1 -+ veor $dat2,$in2,$iv2 -+ veor $dat3,$in3,$iv3 -+ veor $dat4,$in4,$iv4 -+ b.eq .Loop5x_xts_dec -+ -+.Loop5x_dec_after: -+ add $len,$len,#0x50 -+ cbz $len,.Lxts_done -+ -+ add $rounds,$rounds0,#2 -+ subs $len,$len,#0x30 -+ b.lo .Lxts_inner_dec_tail -+ -+ veor $dat0,$iv0,$in2 -+ veor $dat1,$iv1,$in3 -+ veor $dat2,$in4,$iv2 -+ b .Lxts_outer_dec_tail -+ -+.align 4 -+.Lxts_dec_tail4x: -+ add $inp,$inp,#16 -+ vld1.32 {$dat0},[$inp],#16 -+ veor $tmp1,$dat1,$tmp0 -+ vst1.8 {$tmp1},[$out],#16 -+ veor $tmp2,$dat2,$tmp2 -+ vst1.8 {$tmp2},[$out],#16 -+ veor $tmp3,$dat3,$tmp3 -+ veor $tmp4,$dat4,$tmp4 -+ vst1.8 {$tmp3-$tmp4},[$out],#32 -+ -+ b .Lxts_done -+.align 4 -+.Lxts_outer_dec_tail: -+ aesd $dat0,q8 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q8 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q8 -+ aesimc $dat2,$dat2 -+ vld1.32 {q8},[$key_],#16 -+ subs $rounds,$rounds,#2 -+ aesd $dat0,q9 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q9 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q9 -+ aesimc $dat2,$dat2 -+ vld1.32 {q9},[$key_],#16 -+ b.gt .Lxts_outer_dec_tail -+ -+ aesd $dat0,q8 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q8 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q8 -+ aesimc $dat2,$dat2 -+ veor $tmp0,$iv0,$rndlast -+ subs $len,$len,#0x30 -+ // The iv for first block -+ fmov $ivl,$ivd20 -+ fmov $ivh,$ivd21 -+ mov $constnum,#0x87 -+ extr $midnumx,$ivh,$ivh,#32 -+ extr $ivh,$ivh,$ivl,#63 -+ and $tmpmw,$constnum,$midnum,asr #31 -+ eor $ivl,$tmpmx,$ivl,lsl #1 -+ fmov $ivd00,$ivl -+ fmov $ivd01,$ivh -+ veor $tmp1,$iv1,$rndlast -+ csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point -+ aesd $dat0,q9 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q9 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q9 -+ aesimc $dat2,$dat2 -+ veor $tmp2,$iv2,$rndlast -+ // The iv for second block -+ extr $midnumx,$ivh,$ivh,#32 -+ extr $ivh,$ivh,$ivl,#63 -+ and $tmpmw,$constnum,$midnum,asr #31 -+ eor $ivl,$tmpmx,$ivl,lsl #1 -+ fmov $ivd10,$ivl -+ fmov $ivd11,$ivh -+ -+ add $xoffset,$xoffset,#0x20 -+ add $inp,$inp,$xoffset // $inp is adjusted to the last data -+ -+ mov $key_,$key1 -+ -+ // The iv for third block -+ extr $midnumx,$ivh,$ivh,#32 -+ extr $ivh,$ivh,$ivl,#63 -+ and $tmpmw,$constnum,$midnum,asr #31 -+ eor $ivl,$tmpmx,$ivl,lsl #1 -+ fmov $ivd20,$ivl -+ fmov $ivd21,$ivh -+ -+ aesd $dat0,q12 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q12 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q12 -+ aesimc $dat2,$dat2 -+ aesd $dat0,q13 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q13 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q13 -+ aesimc $dat2,$dat2 -+ aesd $dat0,q14 -+ aesimc $dat0,$dat0 -+ aesd $dat1,q14 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q14 -+ aesimc $dat2,$dat2 -+ vld1.8 {$in2},[$inp],#16 -+ aesd $dat0,q15 -+ aesd $dat1,q15 -+ aesd $dat2,q15 -+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] -+ add $rounds,$rounds0,#2 -+ veor $tmp0,$tmp0,$dat0 -+ veor $tmp1,$tmp1,$dat1 -+ veor $dat2,$dat2,$tmp2 -+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] -+ vst1.8 {$tmp0},[$out],#16 -+ vst1.8 {$tmp1},[$out],#16 -+ vst1.8 {$dat2},[$out],#16 -+ -+ cmn $len,#0x30 -+ add $len,$len,#0x30 -+ b.eq .Lxts_done -+ sub $len,$len,#0x30 -+ vorr $in3,$in1,$in1 -+ vorr $in4,$in2,$in2 -+ nop -+ -+.Lxts_inner_dec_tail: -+ // $len == -0x10 means two blocks left. 
-+ cmn $len,#0x10 -+ veor $dat1,$in3,$iv0 -+ veor $dat2,$in4,$iv1 -+ b.eq .Lxts_dec_tail_loop -+ veor $dat2,$in4,$iv0 -+.Lxts_dec_tail_loop: -+ aesd $dat1,q8 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q8 -+ aesimc $dat2,$dat2 -+ vld1.32 {q8},[$key_],#16 -+ subs $rounds,$rounds,#2 -+ aesd $dat1,q9 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q9 -+ aesimc $dat2,$dat2 -+ vld1.32 {q9},[$key_],#16 -+ b.gt .Lxts_dec_tail_loop -+ -+ aesd $dat1,q8 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q8 -+ aesimc $dat2,$dat2 -+ aesd $dat1,q9 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q9 -+ aesimc $dat2,$dat2 -+ aesd $dat1,q12 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q12 -+ aesimc $dat2,$dat2 -+ cmn $len,#0x20 -+ aesd $dat1,q13 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q13 -+ aesimc $dat2,$dat2 -+ veor $tmp1,$iv0,$rndlast -+ aesd $dat1,q14 -+ aesimc $dat1,$dat1 -+ aesd $dat2,q14 -+ aesimc $dat2,$dat2 -+ veor $tmp2,$iv1,$rndlast -+ aesd $dat1,q15 -+ aesd $dat2,q15 -+ b.eq .Lxts_dec_one -+ veor $tmp1,$tmp1,$dat1 -+ veor $tmp2,$tmp2,$dat2 -+ vorr $iv0,$iv2,$iv2 -+ vorr $iv1,$iv3,$iv3 -+ vst1.8 {$tmp1},[$out],#16 -+ vst1.8 {$tmp2},[$out],#16 -+ add $len,$len,#16 -+ b .Lxts_done -+ -+.Lxts_dec_one: -+ veor $tmp1,$tmp1,$dat2 -+ vorr $iv0,$iv1,$iv1 -+ vorr $iv1,$iv2,$iv2 -+ vst1.8 {$tmp1},[$out],#16 -+ add $len,$len,#32 -+ -+.Lxts_done: -+ tst $tailcnt,#0xf -+ b.eq .Lxts_dec_abort -+ // Processing the last two blocks with cipher stealing. -+ mov x7,x3 -+ cbnz x2,.Lxts_dec_1st_done -+ vld1.32 {$dat0},[$inp],#16 -+ -+ // Decrypt the last secod block to get the last plain text block -+.Lxts_dec_1st_done: -+ eor $tmpin,$dat0,$iv1 -+ ldr $rounds,[$key1,#240] -+ vld1.32 {$dat0},[$key1],#16 -+ sub $rounds,$rounds,#2 -+ vld1.32 {$dat1},[$key1],#16 -+.Loop_final_2nd_dec: -+ aesd $tmpin,$dat0 -+ aesimc $tmpin,$tmpin -+ vld1.32 {$dat0},[$key1],#16 // load key schedule... -+ subs $rounds,$rounds,#2 -+ aesd $tmpin,$dat1 -+ aesimc $tmpin,$tmpin -+ vld1.32 {$dat1},[$key1],#16 // load key schedule... -+ b.gt .Loop_final_2nd_dec -+ -+ aesd $tmpin,$dat0 -+ aesimc $tmpin,$tmpin -+ vld1.32 {$dat0},[$key1] -+ aesd $tmpin,$dat1 -+ veor $tmpin,$tmpin,$dat0 -+ veor $tmpin,$tmpin,$iv1 -+ vst1.8 {$tmpin},[$out] -+ -+ mov $tmpinp,$inp -+ add $tmpoutp,$out,#16 -+ -+ // Composite the tailcnt "16 byte not aligned block" into the last second plain blocks -+ // to get the last encrypted block. -+.composite_dec_loop: -+ subs $tailcnt,$tailcnt,#1 -+ ldrb $l2outp,[$out,$tailcnt] -+ ldrb $loutp,[$tmpinp,$tailcnt] -+ strb $l2outp,[$tmpoutp,$tailcnt] -+ strb $loutp,[$out,$tailcnt] -+ b.gt .composite_dec_loop -+.Lxts_dec_load_done: -+ vld1.8 {$tmpin},[$out] -+ veor $tmpin,$tmpin,$iv0 -+ -+ // Decrypt the composite block to get the last second plain text block -+ ldr $rounds,[$key_,#240] -+ vld1.8 {$dat},[$key_],#16 -+ sub $rounds,$rounds,#2 -+ vld1.8 {$dat1},[$key_],#16 -+.Loop_final_dec: -+ aesd $tmpin,$dat0 -+ aesimc $tmpin,$tmpin -+ vld1.32 {$dat0},[$key_],#16 // load key schedule... -+ subs $rounds,$rounds,#2 -+ aesd $tmpin,$dat1 -+ aesimc $tmpin,$tmpin -+ vld1.32 {$dat1},[$key_],#16 // load key schedule... 
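
For reference, the .composite_enc_loop/.composite_dec_loop byte loops above implement XTS ciphertext stealing: the leading bytes of the last full output block are handed out as the short final block, and the partial input bytes are spliced in so the resulting composite block can be passed through the cipher once more. A rough C sketch of the swap (illustration only; the helper and buffer names are made up for this sketch, not taken from the patch):

    #include <stddef.h>

    /* Swap the first 'tail' bytes of the last full output block with the
     * partial final input block, as the composite_*_loop ldrb/strb loops do. */
    static void xts_steal_tail(unsigned char *last_block,    /* last full output block */
                               const unsigned char *tail_in, /* partial final input    */
                               unsigned char *tail_out,      /* partial final output   */
                               size_t tail)
    {
        for (size_t i = 0; i < tail; i++) {
            unsigned char b = last_block[i];
            last_block[i] = tail_in[i]; /* build the composite block to run through AES */
            tail_out[i] = b;            /* stolen bytes become the short final block    */
        }
    }
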
-+ b.gt .Loop_final_dec -+ -+ aesd $tmpin,$dat0 -+ aesimc $tmpin,$tmpin -+ vld1.32 {$dat0},[$key_] -+ aesd $tmpin,$dat1 -+ veor $tmpin,$tmpin,$dat0 -+ veor $tmpin,$tmpin,$iv0 -+ vst1.8 {$tmpin},[$out] -+ -+.Lxts_dec_abort: -+ ldp $tailcnt,$midnumx,[sp,#48] -+ ldp $ivd10,$ivd20,[sp,#32] -+ ldp $ivd30,$ivd40,[sp,#16] -+ ldp $constnumx,$tmpinp,[sp],#64 -+ -+.Lxts_dec_final_abort: -+ ret -+.size ${prefix}_xts_decrypt,.-${prefix}_xts_decrypt -+___ -+} -+}}} - $code.=<<___; - #endif - ___ -@@ -963,7 +3615,7 @@ if ($flavour =~ /64/) { ######## 64-bi - # since ARMv7 instructions are always encoded little-endian. - # correct solution is to use .inst directive, but older - # assemblers don't implement it:-( -- sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", -+ sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s", - $word&0xff,($word>>8)&0xff, - ($word>>16)&0xff,($word>>24)&0xff, - $mnemonic,$arg; -@@ -1004,14 +3656,17 @@ if ($flavour =~ /64/) { ######## 64-bi - s/\],#[0-9]+/]!/o; - - s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or -- s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or -+ s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or - s/vtbl\.8\s+(.*)/unvtbl($1)/geo or - s/vdup\.32\s+(.*)/unvdup32($1)/geo or - s/vmov\.32\s+(.*)/unvmov32($1)/geo or - s/^(\s+)b\./$1b/o or -- s/^(\s+)mov\./$1mov/o or - s/^(\s+)ret/$1bx\tlr/o; - -+ if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) { -+ print " it $2\n"; -+ } -+ - print $_,"\n"; - } - } -diff -up openssl-1.1.1i/crypto/aes/asm/vpaes-armv8.pl.arm-update openssl-1.1.1i/crypto/aes/asm/vpaes-armv8.pl ---- openssl-1.1.1i/crypto/aes/asm/vpaes-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100 -+++ openssl-1.1.1i/crypto/aes/asm/vpaes-armv8.pl 2020-12-09 10:37:38.405558929 +0100 -@@ -30,6 +30,7 @@ - # Denver(***) 16.6(**) 15.1/17.8(**) [8.80/9.93 ] - # Apple A7(***) 22.7(**) 10.9/14.3 [8.45/10.0 ] - # Mongoose(***) 26.3(**) 21.0/25.0(**) [13.3/16.8 ] -+# ThunderX2(***) 39.4(**) 33.8/48.6(**) - # - # (*) ECB denotes approximate result for parallelizable modes - # such as CBC decrypt, CTR, etc.; -diff -up openssl-1.1.1i/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1.1i/crypto/chacha/asm/chacha-armv8.pl ---- openssl-1.1.1i/crypto/chacha/asm/chacha-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100 -+++ openssl-1.1.1i/crypto/chacha/asm/chacha-armv8.pl 2020-12-09 10:40:57.922288627 +0100 -@@ -18,32 +18,44 @@ - # - # ChaCha20 for ARMv8. - # -+# April 2019 -+# -+# Replace 3xNEON+1xIALU code path with 4+1. 4+1 is actually fastest -+# option on most(*), but not all, processors, yet 6+2 is retained. -+# This is because penalties are considered tolerable in comparison to -+# improvement on processors where 6+2 helps. Most notably +37% on -+# ThunderX2. It's server-oriented processor which will have to serve -+# as many requests as possible. While others are mostly clients, when -+# performance doesn't have to be absolute top-notch, just fast enough, -+# as majority of time is spent "entertaining" relatively slow human. -+# - # Performance in cycles per byte out of large buffer. 
- # --# IALU/gcc-4.9 3xNEON+1xIALU 6xNEON+2xIALU -+# IALU/gcc-4.9 4xNEON+1xIALU 6xNEON+2xIALU - # --# Apple A7 5.50/+49% 3.33 1.70 --# Cortex-A53 8.40/+80% 4.72 4.72(*) --# Cortex-A57 8.06/+43% 4.90 4.43(**) --# Denver 4.50/+82% 2.63 2.67(*) --# X-Gene 9.50/+46% 8.82 8.89(*) --# Mongoose 8.00/+44% 3.64 3.25 --# Kryo 8.17/+50% 4.83 4.65 -+# Apple A7 5.50/+49% 2.72 1.60 -+# Cortex-A53 8.40/+80% 4.06 4.45(*) -+# Cortex-A57 8.06/+43% 4.15 4.40(*) -+# Denver 4.50/+82% 2.30 2.70(*) -+# X-Gene 9.50/+46% 8.20 8.90(*) -+# Mongoose 8.00/+44% 2.74 3.12(*) -+# Kryo 8.17/+50% 4.47 4.65(*) -+# ThunderX2 7.22/+48% 5.64 4.10 - # --# (*) it's expected that doubling interleave factor doesn't help --# all processors, only those with higher NEON latency and --# higher instruction issue rate; --# (**) expected improvement was actually higher; -+# (*) slower than 4+1:-( - --$flavour=shift; --$output=shift; -+# $output is the last argument if it looks like a file (it has an extension) -+# $flavour is the first argument if it doesn't look like a file -+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; -+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; - - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - --open OUT,"| \"$^X\" $xlate $flavour $output"; -+open OUT,"| \"$^X\" $xlate $flavour \"$output\"" -+ or die "can't call $xlate: $!"; - *STDOUT=*OUT; - - sub AUTOLOAD() # thunk [simplified] x86-style perlasm -@@ -120,42 +132,37 @@ my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1) - } - - $code.=<<___; --#include "arm_arch.h" -- --.text -- -+#ifndef __KERNEL__ -+# include "arm_arch.h" - .extern OPENSSL_armcap_P - .hidden OPENSSL_armcap_P -+#endif -+ -+.text - - .align 5 - .Lsigma: - .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral - .Lone: --.long 1,0,0,0 --.LOPENSSL_armcap_P: --#ifdef __ILP32__ --.long OPENSSL_armcap_P-. --#else --.quad OPENSSL_armcap_P-. 
--#endif --.asciz "ChaCha20 for ARMv8, CRYPTOGAMS by " -+.long 1,2,3,4 -+.Lrot24: -+.long 0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f -+.asciz "ChaCha20 for ARMv8, CRYPTOGAMS by \@dot-asm" - - .globl ChaCha20_ctr32 - .type ChaCha20_ctr32,%function - .align 5 - ChaCha20_ctr32: - cbz $len,.Labort -- adr @x[0],.LOPENSSL_armcap_P - cmp $len,#192 - b.lo .Lshort --#ifdef __ILP32__ -- ldrsw @x[1],[@x[0]] --#else -- ldr @x[1],[@x[0]] --#endif -- ldr w17,[@x[1],@x[0]] -+ -+#ifndef __KERNEL__ -+ adrp x17,OPENSSL_armcap_P -+ ldr w17,[x17,#:lo12:OPENSSL_armcap_P] - tst w17,#ARMV7_NEON -- b.ne ChaCha20_neon -+ b.ne .LChaCha20_neon -+#endif - - .Lshort: - .inst 0xd503233f // paciasp -@@ -174,7 +181,7 @@ ChaCha20_ctr32: - ldp @d[2],@d[3],[$key] // load key - ldp @d[4],@d[5],[$key,#16] - ldp @d[6],@d[7],[$ctr] // load counter --#ifdef __ARMEB__ -+#ifdef __AARCH64EB__ - ror @d[2],@d[2],#32 - ror @d[3],@d[3],#32 - ror @d[4],@d[4],#32 -@@ -243,7 +250,7 @@ $code.=<<___; - add @x[14],@x[14],@x[15],lsl#32 - ldp @x[13],@x[15],[$inp,#48] - add $inp,$inp,#64 --#ifdef __ARMEB__ -+#ifdef __AARCH64EB__ - rev @x[0],@x[0] - rev @x[2],@x[2] - rev @x[4],@x[4] -@@ -300,7 +307,7 @@ $code.=<<___; - add @x[10],@x[10],@x[11],lsl#32 - add @x[12],@x[12],@x[13],lsl#32 - add @x[14],@x[14],@x[15],lsl#32 --#ifdef __ARMEB__ -+#ifdef __AARCH64EB__ - rev @x[0],@x[0] - rev @x[2],@x[2] - rev @x[4],@x[4] -@@ -341,46 +348,91 @@ $code.=<<___; - ___ - - {{{ --my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) = -- map("v$_.4s",(0..7,16..23)); --my (@K)=map("v$_.4s",(24..30)); --my $ONE="v31.4s"; -+my @K = map("v$_.4s",(0..3)); -+my ($xt0,$xt1,$xt2,$xt3, $CTR,$ROT24) = map("v$_.4s",(4..9)); -+my @X = map("v$_.4s",(16,20,24,28, 17,21,25,29, 18,22,26,30, 19,23,27,31)); -+my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, -+ $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @X; - --sub NEONROUND { --my $odd = pop; --my ($a,$b,$c,$d,$t)=@_; -+sub NEON_lane_ROUND { -+my ($a0,$b0,$c0,$d0)=@_; -+my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); -+my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); -+my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); -+my @x=map("'$_'",@X); - - ( -- "&add ('$a','$a','$b')", -- "&eor ('$d','$d','$a')", -- "&rev32_16 ('$d','$d')", # vrot ($d,16) -- -- "&add ('$c','$c','$d')", -- "&eor ('$t','$b','$c')", -- "&ushr ('$b','$t',20)", -- "&sli ('$b','$t',12)", -- -- "&add ('$a','$a','$b')", -- "&eor ('$t','$d','$a')", -- "&ushr ('$d','$t',24)", -- "&sli ('$d','$t',8)", -- -- "&add ('$c','$c','$d')", -- "&eor ('$t','$b','$c')", -- "&ushr ('$b','$t',25)", -- "&sli ('$b','$t',7)", -- -- "&ext ('$c','$c','$c',8)", -- "&ext ('$d','$d','$d',$odd?4:12)", -- "&ext ('$b','$b','$b',$odd?12:4)" -+ "&add (@x[$a0],@x[$a0],@x[$b0])", # Q1 -+ "&add (@x[$a1],@x[$a1],@x[$b1])", # Q2 -+ "&add (@x[$a2],@x[$a2],@x[$b2])", # Q3 -+ "&add (@x[$a3],@x[$a3],@x[$b3])", # Q4 -+ "&eor (@x[$d0],@x[$d0],@x[$a0])", -+ "&eor (@x[$d1],@x[$d1],@x[$a1])", -+ "&eor (@x[$d2],@x[$d2],@x[$a2])", -+ "&eor (@x[$d3],@x[$d3],@x[$a3])", -+ "&rev32_16 (@x[$d0],@x[$d0])", -+ "&rev32_16 (@x[$d1],@x[$d1])", -+ "&rev32_16 (@x[$d2],@x[$d2])", -+ "&rev32_16 (@x[$d3],@x[$d3])", -+ -+ "&add (@x[$c0],@x[$c0],@x[$d0])", -+ "&add (@x[$c1],@x[$c1],@x[$d1])", -+ "&add (@x[$c2],@x[$c2],@x[$d2])", -+ "&add (@x[$c3],@x[$c3],@x[$d3])", -+ "&eor ('$xt0',@x[$b0],@x[$c0])", -+ "&eor ('$xt1',@x[$b1],@x[$c1])", -+ "&eor ('$xt2',@x[$b2],@x[$c2])", -+ "&eor ('$xt3',@x[$b3],@x[$c3])", -+ "&ushr (@x[$b0],'$xt0',20)", -+ "&ushr 
(@x[$b1],'$xt1',20)", -+ "&ushr (@x[$b2],'$xt2',20)", -+ "&ushr (@x[$b3],'$xt3',20)", -+ "&sli (@x[$b0],'$xt0',12)", -+ "&sli (@x[$b1],'$xt1',12)", -+ "&sli (@x[$b2],'$xt2',12)", -+ "&sli (@x[$b3],'$xt3',12)", -+ -+ "&add (@x[$a0],@x[$a0],@x[$b0])", -+ "&add (@x[$a1],@x[$a1],@x[$b1])", -+ "&add (@x[$a2],@x[$a2],@x[$b2])", -+ "&add (@x[$a3],@x[$a3],@x[$b3])", -+ "&eor ('$xt0',@x[$d0],@x[$a0])", -+ "&eor ('$xt1',@x[$d1],@x[$a1])", -+ "&eor ('$xt2',@x[$d2],@x[$a2])", -+ "&eor ('$xt3',@x[$d3],@x[$a3])", -+ "&tbl (@x[$d0],'{$xt0}','$ROT24')", -+ "&tbl (@x[$d1],'{$xt1}','$ROT24')", -+ "&tbl (@x[$d2],'{$xt2}','$ROT24')", -+ "&tbl (@x[$d3],'{$xt3}','$ROT24')", -+ -+ "&add (@x[$c0],@x[$c0],@x[$d0])", -+ "&add (@x[$c1],@x[$c1],@x[$d1])", -+ "&add (@x[$c2],@x[$c2],@x[$d2])", -+ "&add (@x[$c3],@x[$c3],@x[$d3])", -+ "&eor ('$xt0',@x[$b0],@x[$c0])", -+ "&eor ('$xt1',@x[$b1],@x[$c1])", -+ "&eor ('$xt2',@x[$b2],@x[$c2])", -+ "&eor ('$xt3',@x[$b3],@x[$c3])", -+ "&ushr (@x[$b0],'$xt0',25)", -+ "&ushr (@x[$b1],'$xt1',25)", -+ "&ushr (@x[$b2],'$xt2',25)", -+ "&ushr (@x[$b3],'$xt3',25)", -+ "&sli (@x[$b0],'$xt0',7)", -+ "&sli (@x[$b1],'$xt1',7)", -+ "&sli (@x[$b2],'$xt2',7)", -+ "&sli (@x[$b3],'$xt3',7)" - ); - } - - $code.=<<___; - -+#ifdef __KERNEL__ -+.globl ChaCha20_neon -+#endif - .type ChaCha20_neon,%function - .align 5 - ChaCha20_neon: -+.LChaCha20_neon: - .inst 0xd503233f // paciasp - stp x29,x30,[sp,#-96]! - add x29,sp,#0 -@@ -403,8 +455,9 @@ ChaCha20_neon: - ld1 {@K[1],@K[2]},[$key] - ldp @d[6],@d[7],[$ctr] // load counter - ld1 {@K[3]},[$ctr] -- ld1 {$ONE},[@x[0]] --#ifdef __ARMEB__ -+ stp d8,d9,[sp] // meet ABI requirements -+ ld1 {$CTR,$ROT24},[@x[0]] -+#ifdef __AARCH64EB__ - rev64 @K[0],@K[0] - ror @d[2],@d[2],#32 - ror @d[3],@d[3],#32 -@@ -413,115 +466,129 @@ ChaCha20_neon: - ror @d[6],@d[6],#32 - ror @d[7],@d[7],#32 - #endif -- add @K[3],@K[3],$ONE // += 1 -- add @K[4],@K[3],$ONE -- add @K[5],@K[4],$ONE -- shl $ONE,$ONE,#2 // 1 -> 4 - - .Loop_outer_neon: -- mov.32 @x[0],@d[0] // unpack key block -- lsr @x[1],@d[0],#32 -- mov $A0,@K[0] -- mov.32 @x[2],@d[1] -- lsr @x[3],@d[1],#32 -- mov $A1,@K[0] -- mov.32 @x[4],@d[2] -- lsr @x[5],@d[2],#32 -- mov $A2,@K[0] -- mov.32 @x[6],@d[3] -- mov $B0,@K[1] -- lsr @x[7],@d[3],#32 -- mov $B1,@K[1] -- mov.32 @x[8],@d[4] -- mov $B2,@K[1] -- lsr @x[9],@d[4],#32 -- mov $D0,@K[3] -- mov.32 @x[10],@d[5] -- mov $D1,@K[4] -- lsr @x[11],@d[5],#32 -- mov $D2,@K[5] -- mov.32 @x[12],@d[6] -- mov $C0,@K[2] -- lsr @x[13],@d[6],#32 -- mov $C1,@K[2] -- mov.32 @x[14],@d[7] -- mov $C2,@K[2] -- lsr @x[15],@d[7],#32 -+ dup $xa0,@{K[0]}[0] // unpack key block -+ mov.32 @x[0],@d[0] -+ dup $xa1,@{K[0]}[1] -+ lsr @x[1],@d[0],#32 -+ dup $xa2,@{K[0]}[2] -+ mov.32 @x[2],@d[1] -+ dup $xa3,@{K[0]}[3] -+ lsr @x[3],@d[1],#32 -+ dup $xb0,@{K[1]}[0] -+ mov.32 @x[4],@d[2] -+ dup $xb1,@{K[1]}[1] -+ lsr @x[5],@d[2],#32 -+ dup $xb2,@{K[1]}[2] -+ mov.32 @x[6],@d[3] -+ dup $xb3,@{K[1]}[3] -+ lsr @x[7],@d[3],#32 -+ dup $xd0,@{K[3]}[0] -+ mov.32 @x[8],@d[4] -+ dup $xd1,@{K[3]}[1] -+ lsr @x[9],@d[4],#32 -+ dup $xd2,@{K[3]}[2] -+ mov.32 @x[10],@d[5] -+ dup $xd3,@{K[3]}[3] -+ lsr @x[11],@d[5],#32 -+ add $xd0,$xd0,$CTR -+ mov.32 @x[12],@d[6] -+ dup $xc0,@{K[2]}[0] -+ lsr @x[13],@d[6],#32 -+ dup $xc1,@{K[2]}[1] -+ mov.32 @x[14],@d[7] -+ dup $xc2,@{K[2]}[2] -+ lsr @x[15],@d[7],#32 -+ dup $xc3,@{K[2]}[3] - - mov $ctr,#10 -- subs $len,$len,#256 -+ subs $len,$len,#320 - .Loop_neon: - sub $ctr,$ctr,#1 - ___ -- my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); -- my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); -- 
my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); -- my @thread3=&ROUND(0,4,8,12); -- -- foreach (@thread0) { -- eval; eval(shift(@thread3)); -- eval(shift(@thread1)); eval(shift(@thread3)); -- eval(shift(@thread2)); eval(shift(@thread3)); -- } -- -- @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); -- @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); -- @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); -- @thread3=&ROUND(0,5,10,15); -+ my @plus_one=&ROUND(0,4,8,12); -+ foreach (&NEON_lane_ROUND(0,4,8,12)) { eval; eval(shift(@plus_one)); } - -- foreach (@thread0) { -- eval; eval(shift(@thread3)); -- eval(shift(@thread1)); eval(shift(@thread3)); -- eval(shift(@thread2)); eval(shift(@thread3)); -- } -+ @plus_one=&ROUND(0,5,10,15); -+ foreach (&NEON_lane_ROUND(0,5,10,15)) { eval; eval(shift(@plus_one)); } - $code.=<<___; - cbnz $ctr,.Loop_neon - -- add.32 @x[0],@x[0],@d[0] // accumulate key block -- add $A0,$A0,@K[0] -- add @x[1],@x[1],@d[0],lsr#32 -- add $A1,$A1,@K[0] -- add.32 @x[2],@x[2],@d[1] -- add $A2,$A2,@K[0] -- add @x[3],@x[3],@d[1],lsr#32 -- add $C0,$C0,@K[2] -- add.32 @x[4],@x[4],@d[2] -- add $C1,$C1,@K[2] -- add @x[5],@x[5],@d[2],lsr#32 -- add $C2,$C2,@K[2] -- add.32 @x[6],@x[6],@d[3] -- add $D0,$D0,@K[3] -- add @x[7],@x[7],@d[3],lsr#32 -- add.32 @x[8],@x[8],@d[4] -- add $D1,$D1,@K[4] -- add @x[9],@x[9],@d[4],lsr#32 -- add.32 @x[10],@x[10],@d[5] -- add $D2,$D2,@K[5] -- add @x[11],@x[11],@d[5],lsr#32 -- add.32 @x[12],@x[12],@d[6] -- add $B0,$B0,@K[1] -- add @x[13],@x[13],@d[6],lsr#32 -- add.32 @x[14],@x[14],@d[7] -- add $B1,$B1,@K[1] -- add @x[15],@x[15],@d[7],lsr#32 -- add $B2,$B2,@K[1] -+ add $xd0,$xd0,$CTR -+ -+ zip1 $xt0,$xa0,$xa1 // transpose data -+ zip1 $xt1,$xa2,$xa3 -+ zip2 $xt2,$xa0,$xa1 -+ zip2 $xt3,$xa2,$xa3 -+ zip1.64 $xa0,$xt0,$xt1 -+ zip2.64 $xa1,$xt0,$xt1 -+ zip1.64 $xa2,$xt2,$xt3 -+ zip2.64 $xa3,$xt2,$xt3 -+ -+ zip1 $xt0,$xb0,$xb1 -+ zip1 $xt1,$xb2,$xb3 -+ zip2 $xt2,$xb0,$xb1 -+ zip2 $xt3,$xb2,$xb3 -+ zip1.64 $xb0,$xt0,$xt1 -+ zip2.64 $xb1,$xt0,$xt1 -+ zip1.64 $xb2,$xt2,$xt3 -+ zip2.64 $xb3,$xt2,$xt3 -+ -+ zip1 $xt0,$xc0,$xc1 -+ add.32 @x[0],@x[0],@d[0] // accumulate key block -+ zip1 $xt1,$xc2,$xc3 -+ add @x[1],@x[1],@d[0],lsr#32 -+ zip2 $xt2,$xc0,$xc1 -+ add.32 @x[2],@x[2],@d[1] -+ zip2 $xt3,$xc2,$xc3 -+ add @x[3],@x[3],@d[1],lsr#32 -+ zip1.64 $xc0,$xt0,$xt1 -+ add.32 @x[4],@x[4],@d[2] -+ zip2.64 $xc1,$xt0,$xt1 -+ add @x[5],@x[5],@d[2],lsr#32 -+ zip1.64 $xc2,$xt2,$xt3 -+ add.32 @x[6],@x[6],@d[3] -+ zip2.64 $xc3,$xt2,$xt3 -+ add @x[7],@x[7],@d[3],lsr#32 -+ -+ zip1 $xt0,$xd0,$xd1 -+ add.32 @x[8],@x[8],@d[4] -+ zip1 $xt1,$xd2,$xd3 -+ add @x[9],@x[9],@d[4],lsr#32 -+ zip2 $xt2,$xd0,$xd1 -+ add.32 @x[10],@x[10],@d[5] -+ zip2 $xt3,$xd2,$xd3 -+ add @x[11],@x[11],@d[5],lsr#32 -+ zip1.64 $xd0,$xt0,$xt1 -+ add.32 @x[12],@x[12],@d[6] -+ zip2.64 $xd1,$xt0,$xt1 -+ add @x[13],@x[13],@d[6],lsr#32 -+ zip1.64 $xd2,$xt2,$xt3 -+ add.32 @x[14],@x[14],@d[7] -+ zip2.64 $xd3,$xt2,$xt3 -+ add @x[15],@x[15],@d[7],lsr#32 - - b.lo .Ltail_neon - - add @x[0],@x[0],@x[1],lsl#32 // pack - add @x[2],@x[2],@x[3],lsl#32 - ldp @x[1],@x[3],[$inp,#0] // load input -+ add $xa0,$xa0,@K[0] // accumulate key block - add @x[4],@x[4],@x[5],lsl#32 - add @x[6],@x[6],@x[7],lsl#32 - ldp @x[5],@x[7],[$inp,#16] -+ add $xb0,$xb0,@K[1] - add @x[8],@x[8],@x[9],lsl#32 - add @x[10],@x[10],@x[11],lsl#32 - ldp @x[9],@x[11],[$inp,#32] -+ add $xc0,$xc0,@K[2] - add @x[12],@x[12],@x[13],lsl#32 - add @x[14],@x[14],@x[15],lsl#32 - ldp @x[13],@x[15],[$inp,#48] -+ add $xd0,$xd0,@K[3] - add $inp,$inp,#64 --#ifdef __ARMEB__ -+#ifdef __AARCH64EB__ - 
rev @x[0],@x[0] - rev @x[2],@x[2] - rev @x[4],@x[4] -@@ -531,48 +598,68 @@ $code.=<<___; - rev @x[12],@x[12] - rev @x[14],@x[14] - #endif -- ld1.8 {$T0-$T3},[$inp],#64 -+ ld1.8 {$xt0-$xt3},[$inp],#64 - eor @x[0],@x[0],@x[1] -+ add $xa1,$xa1,@K[0] - eor @x[2],@x[2],@x[3] -+ add $xb1,$xb1,@K[1] - eor @x[4],@x[4],@x[5] -+ add $xc1,$xc1,@K[2] - eor @x[6],@x[6],@x[7] -+ add $xd1,$xd1,@K[3] - eor @x[8],@x[8],@x[9] -- eor $A0,$A0,$T0 -+ eor $xa0,$xa0,$xt0 -+ movi $xt0,#5 - eor @x[10],@x[10],@x[11] -- eor $B0,$B0,$T1 -+ eor $xb0,$xb0,$xt1 - eor @x[12],@x[12],@x[13] -- eor $C0,$C0,$T2 -+ eor $xc0,$xc0,$xt2 - eor @x[14],@x[14],@x[15] -- eor $D0,$D0,$T3 -- ld1.8 {$T0-$T3},[$inp],#64 -+ eor $xd0,$xd0,$xt3 -+ add $CTR,$CTR,$xt0 // += 5 -+ ld1.8 {$xt0-$xt3},[$inp],#64 - - stp @x[0],@x[2],[$out,#0] // store output -- add @d[6],@d[6],#4 // increment counter -+ add @d[6],@d[6],#5 // increment counter - stp @x[4],@x[6],[$out,#16] -- add @K[3],@K[3],$ONE // += 4 - stp @x[8],@x[10],[$out,#32] -- add @K[4],@K[4],$ONE - stp @x[12],@x[14],[$out,#48] -- add @K[5],@K[5],$ONE - add $out,$out,#64 - -- st1.8 {$A0-$D0},[$out],#64 -- ld1.8 {$A0-$D0},[$inp],#64 -- -- eor $A1,$A1,$T0 -- eor $B1,$B1,$T1 -- eor $C1,$C1,$T2 -- eor $D1,$D1,$T3 -- st1.8 {$A1-$D1},[$out],#64 -- -- eor $A2,$A2,$A0 -- eor $B2,$B2,$B0 -- eor $C2,$C2,$C0 -- eor $D2,$D2,$D0 -- st1.8 {$A2-$D2},[$out],#64 -+ st1.8 {$xa0-$xd0},[$out],#64 -+ add $xa2,$xa2,@K[0] -+ add $xb2,$xb2,@K[1] -+ add $xc2,$xc2,@K[2] -+ add $xd2,$xd2,@K[3] -+ ld1.8 {$xa0-$xd0},[$inp],#64 -+ -+ eor $xa1,$xa1,$xt0 -+ eor $xb1,$xb1,$xt1 -+ eor $xc1,$xc1,$xt2 -+ eor $xd1,$xd1,$xt3 -+ st1.8 {$xa1-$xd1},[$out],#64 -+ add $xa3,$xa3,@K[0] -+ add $xb3,$xb3,@K[1] -+ add $xc3,$xc3,@K[2] -+ add $xd3,$xd3,@K[3] -+ ld1.8 {$xa1-$xd1},[$inp],#64 -+ -+ eor $xa2,$xa2,$xa0 -+ eor $xb2,$xb2,$xb0 -+ eor $xc2,$xc2,$xc0 -+ eor $xd2,$xd2,$xd0 -+ st1.8 {$xa2-$xd2},[$out],#64 -+ -+ eor $xa3,$xa3,$xa1 -+ eor $xb3,$xb3,$xb1 -+ eor $xc3,$xc3,$xc1 -+ eor $xd3,$xd3,$xd1 -+ st1.8 {$xa3-$xd3},[$out],#64 - - b.hi .Loop_outer_neon - -+ ldp d8,d9,[sp] // meet ABI requirements -+ - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] -@@ -583,8 +670,10 @@ $code.=<<___; - .inst 0xd50323bf // autiasp - ret - -+.align 4 - .Ltail_neon: -- add $len,$len,#256 -+ add $len,$len,#320 -+ ldp d8,d9,[sp] // meet ABI requirements - cmp $len,#64 - b.lo .Less_than_64 - -@@ -601,7 +690,7 @@ $code.=<<___; - add @x[14],@x[14],@x[15],lsl#32 - ldp @x[13],@x[15],[$inp,#48] - add $inp,$inp,#64 --#ifdef __ARMEB__ -+#ifdef __AARCH64EB__ - rev @x[0],@x[0] - rev @x[2],@x[2] - rev @x[4],@x[4] -@@ -621,48 +710,68 @@ $code.=<<___; - eor @x[14],@x[14],@x[15] - - stp @x[0],@x[2],[$out,#0] // store output -- add @d[6],@d[6],#4 // increment counter -+ add $xa0,$xa0,@K[0] // accumulate key block - stp @x[4],@x[6],[$out,#16] -+ add $xb0,$xb0,@K[1] - stp @x[8],@x[10],[$out,#32] -+ add $xc0,$xc0,@K[2] - stp @x[12],@x[14],[$out,#48] -+ add $xd0,$xd0,@K[3] - add $out,$out,#64 - b.eq .Ldone_neon - sub $len,$len,#64 - cmp $len,#64 -- b.lo .Less_than_128 -+ b.lo .Last_neon - -- ld1.8 {$T0-$T3},[$inp],#64 -- eor $A0,$A0,$T0 -- eor $B0,$B0,$T1 -- eor $C0,$C0,$T2 -- eor $D0,$D0,$T3 -- st1.8 {$A0-$D0},[$out],#64 -+ ld1.8 {$xt0-$xt3},[$inp],#64 -+ eor $xa0,$xa0,$xt0 -+ eor $xb0,$xb0,$xt1 -+ eor $xc0,$xc0,$xt2 -+ eor $xd0,$xd0,$xt3 -+ st1.8 {$xa0-$xd0},[$out],#64 - b.eq .Ldone_neon -+ -+ add $xa0,$xa1,@K[0] -+ add $xb0,$xb1,@K[1] - sub $len,$len,#64 -+ add $xc0,$xc1,@K[2] - cmp $len,#64 -- b.lo .Less_than_192 -+ add $xd0,$xd1,@K[3] -+ b.lo .Last_neon - 
-- ld1.8 {$T0-$T3},[$inp],#64 -- eor $A1,$A1,$T0 -- eor $B1,$B1,$T1 -- eor $C1,$C1,$T2 -- eor $D1,$D1,$T3 -- st1.8 {$A1-$D1},[$out],#64 -+ ld1.8 {$xt0-$xt3},[$inp],#64 -+ eor $xa1,$xa0,$xt0 -+ eor $xb1,$xb0,$xt1 -+ eor $xc1,$xc0,$xt2 -+ eor $xd1,$xd0,$xt3 -+ st1.8 {$xa1-$xd1},[$out],#64 - b.eq .Ldone_neon -+ -+ add $xa0,$xa2,@K[0] -+ add $xb0,$xb2,@K[1] - sub $len,$len,#64 -+ add $xc0,$xc2,@K[2] -+ cmp $len,#64 -+ add $xd0,$xd2,@K[3] -+ b.lo .Last_neon - -- st1.8 {$A2-$D2},[sp] -- b .Last_neon -+ ld1.8 {$xt0-$xt3},[$inp],#64 -+ eor $xa2,$xa0,$xt0 -+ eor $xb2,$xb0,$xt1 -+ eor $xc2,$xc0,$xt2 -+ eor $xd2,$xd0,$xt3 -+ st1.8 {$xa2-$xd2},[$out],#64 -+ b.eq .Ldone_neon - --.Less_than_128: -- st1.8 {$A0-$D0},[sp] -- b .Last_neon --.Less_than_192: -- st1.8 {$A1-$D1},[sp] -- b .Last_neon -+ add $xa0,$xa3,@K[0] -+ add $xb0,$xb3,@K[1] -+ add $xc0,$xc3,@K[2] -+ add $xd0,$xd3,@K[3] -+ sub $len,$len,#64 - --.align 4 - .Last_neon: -+ st1.8 {$xa0-$xd0},[sp] -+ - sub $out,$out,#1 - add $inp,$inp,$len - add $out,$out,$len -@@ -695,9 +804,41 @@ $code.=<<___; - .size ChaCha20_neon,.-ChaCha20_neon - ___ - { -+my @K = map("v$_.4s",(0..6)); - my ($T0,$T1,$T2,$T3,$T4,$T5)=@K; - my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2, -- $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23)); -+ $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(8..31)); -+my $rot24 = @K[6]; -+my $ONE = "v7.4s"; -+ -+sub NEONROUND { -+my $odd = pop; -+my ($a,$b,$c,$d,$t)=@_; -+ -+ ( -+ "&add ('$a','$a','$b')", -+ "&eor ('$d','$d','$a')", -+ "&rev32_16 ('$d','$d')", # vrot ($d,16) -+ -+ "&add ('$c','$c','$d')", -+ "&eor ('$t','$b','$c')", -+ "&ushr ('$b','$t',20)", -+ "&sli ('$b','$t',12)", -+ -+ "&add ('$a','$a','$b')", -+ "&eor ('$d','$d','$a')", -+ "&tbl ('$d','{$d}','$rot24')", -+ -+ "&add ('$c','$c','$d')", -+ "&eor ('$t','$b','$c')", -+ "&ushr ('$b','$t',25)", -+ "&sli ('$b','$t',7)", -+ -+ "&ext ('$c','$c','$c',8)", -+ "&ext ('$d','$d','$d',$odd?4:12)", -+ "&ext ('$b','$b','$b',$odd?12:4)" -+ ); -+} - - $code.=<<___; - .type ChaCha20_512_neon,%function -@@ -717,6 +858,7 @@ ChaCha20_512_neon: - .L512_or_more_neon: - sub sp,sp,#128+64 - -+ eor $ONE,$ONE,$ONE - ldp @d[0],@d[1],[@x[0]] // load sigma - ld1 {@K[0]},[@x[0]],#16 - ldp @d[2],@d[3],[$key] // load key -@@ -724,8 +866,9 @@ ChaCha20_512_neon: - ld1 {@K[1],@K[2]},[$key] - ldp @d[6],@d[7],[$ctr] // load counter - ld1 {@K[3]},[$ctr] -- ld1 {$ONE},[@x[0]] --#ifdef __ARMEB__ -+ ld1 {$ONE}[0],[@x[0]] -+ add $key,@x[0],#16 // .Lrot24 -+#ifdef __AARCH64EB__ - rev64 @K[0],@K[0] - ror @d[2],@d[2],#32 - ror @d[3],@d[3],#32 -@@ -792,9 +935,10 @@ ChaCha20_512_neon: - mov $C4,@K[2] - stp @K[3],@K[4],[sp,#48] // off-load key block, variable part - mov $C5,@K[2] -- str @K[5],[sp,#80] -+ stp @K[5],@K[6],[sp,#80] - - mov $ctr,#5 -+ ld1 {$rot24},[$key] - subs $len,$len,#512 - .Loop_upper_neon: - sub $ctr,$ctr,#1 -@@ -867,7 +1011,7 @@ $code.=<<___; - add @x[14],@x[14],@x[15],lsl#32 - ldp @x[13],@x[15],[$inp,#48] - add $inp,$inp,#64 --#ifdef __ARMEB__ -+#ifdef __AARCH64EB__ - rev @x[0],@x[0] - rev @x[2],@x[2] - rev @x[4],@x[4] -@@ -956,6 +1100,7 @@ $code.=<<___; - add.32 @x[2],@x[2],@d[1] - ldp @K[4],@K[5],[sp,#64] - add @x[3],@x[3],@d[1],lsr#32 -+ ldr @K[6],[sp,#96] - add $A0,$A0,@K[0] - add.32 @x[4],@x[4],@d[2] - add $A1,$A1,@K[0] -@@ -1008,7 +1153,7 @@ $code.=<<___; - add $inp,$inp,#64 - add $B5,$B5,@K[1] - --#ifdef __ARMEB__ -+#ifdef __AARCH64EB__ - rev @x[0],@x[0] - rev @x[2],@x[2] - rev @x[4],@x[4] -@@ -1086,26 +1231,26 @@ $code.=<<___; - b.hs 
.Loop_outer_512_neon - - adds $len,$len,#512 -- ushr $A0,$ONE,#2 // 4 -> 1 -+ ushr $ONE,$ONE,#1 // 4 -> 2 - -- ldp d8,d9,[sp,#128+0] // meet ABI requirements - ldp d10,d11,[sp,#128+16] - ldp d12,d13,[sp,#128+32] - ldp d14,d15,[sp,#128+48] - -- stp @K[0],$ONE,[sp,#0] // wipe off-load area -- stp @K[0],$ONE,[sp,#32] -- stp @K[0],$ONE,[sp,#64] -+ stp @K[0],@K[0],[sp,#0] // wipe off-load area -+ stp @K[0],@K[0],[sp,#32] -+ stp @K[0],@K[0],[sp,#64] - - b.eq .Ldone_512_neon - -+ sub $key,$key,#16 // .Lone - cmp $len,#192 -- sub @K[3],@K[3],$A0 // -= 1 -- sub @K[4],@K[4],$A0 -- sub @K[5],@K[5],$A0 - add sp,sp,#128 -+ sub @K[3],@K[3],$ONE // -= 2 -+ ld1 {$CTR,$ROT24},[$key] - b.hs .Loop_outer_neon - -+ ldp d8,d9,[sp,#0] // meet ABI requirements - eor @K[1],@K[1],@K[1] - eor @K[2],@K[2],@K[2] - eor @K[3],@K[3],@K[3] -@@ -1115,6 +1260,7 @@ $code.=<<___; - b .Loop_outer - - .Ldone_512_neon: -+ ldp d8,d9,[sp,#128+0] // meet ABI requirements - ldp x19,x20,[x29,#16] - add sp,sp,#128+64 - ldp x21,x22,[x29,#32] -@@ -1133,9 +1279,11 @@ foreach (split("\n",$code)) { - s/\`([^\`]*)\`/eval $1/geo; - - (s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or -- (m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1)) or -+ (m/\b(eor|ext|mov|tbl)\b/ and (s/\.4s/\.16b/g or 1)) or - (s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1)) or - (m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1)) or -+ (m/\b(dup|ld1)\b/ and (s/\.4(s}?\[[0-3]\])/.$1/g or 1)) or -+ (s/\b(zip[12])\.64\b/$1/ and (s/\.4s/\.2d/g or 1)) or - (s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1)); - - #s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; -diff -up openssl-1.1.1i/crypto/modes/asm/ghashv8-armx.pl.arm-update openssl-1.1.1i/crypto/modes/asm/ghashv8-armx.pl ---- openssl-1.1.1i/crypto/modes/asm/ghashv8-armx.pl.arm-update 2020-12-08 14:20:59.000000000 +0100 -+++ openssl-1.1.1i/crypto/modes/asm/ghashv8-armx.pl 2020-12-09 10:37:38.408558954 +0100 -@@ -42,6 +42,7 @@ - # Denver 0.51 0.65 6.02 - # Mongoose 0.65 1.10 8.06 - # Kryo 0.76 1.16 8.00 -+# ThunderX2 1.05 - # - # (*) presented for reference/comparison purposes; - -diff -up openssl-1.1.1i/crypto/poly1305/asm/poly1305-armv8.pl.arm-update openssl-1.1.1i/crypto/poly1305/asm/poly1305-armv8.pl ---- openssl-1.1.1i/crypto/poly1305/asm/poly1305-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100 -+++ openssl-1.1.1i/crypto/poly1305/asm/poly1305-armv8.pl 2020-12-09 10:37:38.408558954 +0100 -@@ -29,6 +29,7 @@ - # X-Gene 2.13/+68% 2.27 - # Mongoose 1.77/+75% 1.12 - # Kryo 2.70/+55% 1.13 -+# ThunderX2 1.17/+95% 1.36 - # - # (*) estimate based on resources availability is less than 1.0, - # i.e. measured result is worse than expected, presumably binary -diff -up openssl-1.1.1i/crypto/sha/asm/keccak1600-armv8.pl.arm-update openssl-1.1.1i/crypto/sha/asm/keccak1600-armv8.pl ---- openssl-1.1.1i/crypto/sha/asm/keccak1600-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100 -+++ openssl-1.1.1i/crypto/sha/asm/keccak1600-armv8.pl 2020-12-09 10:37:38.408558954 +0100 -@@ -51,6 +51,7 @@ - # Kryo 12 - # Denver 7.8 - # Apple A7 7.2 -+# ThunderX2 9.7 - # - # (*) Corresponds to SHA3-256. No improvement coefficients are listed - # because they vary too much from compiler to compiler. 
Newer -diff -up openssl-1.1.1i/crypto/sha/asm/sha1-armv8.pl.arm-update openssl-1.1.1i/crypto/sha/asm/sha1-armv8.pl ---- openssl-1.1.1i/crypto/sha/asm/sha1-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100 -+++ openssl-1.1.1i/crypto/sha/asm/sha1-armv8.pl 2020-12-09 10:37:38.408558954 +0100 -@@ -27,6 +27,7 @@ - # X-Gene 8.80 (+200%) - # Mongoose 2.05 6.50 (+160%) - # Kryo 1.88 8.00 (+90%) -+# ThunderX2 2.64 6.36 (+150%) - # - # (*) Software results are presented mostly for reference purposes. - # (**) Keep in mind that Denver relies on binary translation, which -diff -up openssl-1.1.1i/crypto/sha/asm/sha512-armv8.pl.arm-update openssl-1.1.1i/crypto/sha/asm/sha512-armv8.pl ---- openssl-1.1.1i/crypto/sha/asm/sha512-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100 -+++ openssl-1.1.1i/crypto/sha/asm/sha512-armv8.pl 2020-12-09 10:37:38.408558954 +0100 -@@ -28,6 +28,7 @@ - # X-Gene 20.0 (+100%) 12.8 (+300%(***)) - # Mongoose 2.36 13.0 (+50%) 8.36 (+33%) - # Kryo 1.92 17.4 (+30%) 11.2 (+8%) -+# ThunderX2 2.54 13.2 (+40%) 8.40 (+18%) - # - # (*) Software SHA256 results are of lesser relevance, presented - # mostly for informational purposes. diff --git a/rpm/openssl-1.1.1-evp-kdf.patch b/rpm/openssl-1.1.1-evp-kdf.patch index 6145753..dab4a0c 100644 --- a/rpm/openssl-1.1.1-evp-kdf.patch +++ b/rpm/openssl-1.1.1-evp-kdf.patch @@ -4474,13 +4474,6 @@ diff -up openssl-1.1.1j/test/pkey_meth_kdf_test.c.evp-kdf openssl-1.1.1j/test/pk diff -up openssl-1.1.1j/test/recipes/30-test_evp_data/evpkdf.txt.evp-kdf openssl-1.1.1j/test/recipes/30-test_evp_data/evpkdf.txt --- openssl-1.1.1j/test/recipes/30-test_evp_data/evpkdf.txt.evp-kdf 2021-02-16 16:24:01.000000000 +0100 +++ openssl-1.1.1j/test/recipes/30-test_evp_data/evpkdf.txt 2021-03-03 14:08:02.494294874 +0100 -@@ -1,5 +1,5 @@ - # --# Copyright 2001-2017 The OpenSSL Project Authors. All Rights Reserved. -+# Copyright 2001-2018 The OpenSSL Project Authors. All Rights Reserved. - # - # Licensed under the OpenSSL license (the "License"). You may not use - # this file except in compliance with the License. 
You can obtain a copy @@ -15,7 +15,7 @@ Title = TLS1 PRF tests (from NIST test vectors) @@ -4740,7 +4733,7 @@ diff -up openssl-1.1.1j/test/recipes/30-test_evp_data/evpkdf.txt.evp-kdf openssl Output = 2c91117204d745f3500d636a62f64f0ab3bae548aa53d423b0d1f27ebba6f5e5673a081d70cce7acfc48 @@ -303,3 +303,133 @@ Ctrl.r = r:8 Ctrl.p = p:1 - Result = INTERNAL_ERROR + Result = KDF_DERIVE_ERROR +Title = PBKDF2 tests + diff --git a/rpm/openssl-1.1.1-fips.patch b/rpm/openssl-1.1.1-fips.patch index aa3d33d..fffc973 100644 --- a/rpm/openssl-1.1.1-fips.patch +++ b/rpm/openssl-1.1.1-fips.patch @@ -870,8 +870,8 @@ diff -up openssl-1.1.1j/crypto/evp/digest.c.fips openssl-1.1.1j/crypto/evp/diges +# include +#endif - /* This call frees resources associated with the context */ - int EVP_MD_CTX_reset(EVP_MD_CTX *ctx) + + static void cleanup_old_md_data(EVP_MD_CTX *ctx, int force) @@ -66,6 +69,12 @@ int EVP_DigestInit(EVP_MD_CTX *ctx, cons int EVP_DigestInit_ex(EVP_MD_CTX *ctx, const EVP_MD *type, ENGINE *impl) { @@ -898,9 +898,9 @@ diff -up openssl-1.1.1j/crypto/evp/digest.c.fips openssl-1.1.1j/crypto/evp/diges + } + } +#endif - if (ctx->digest && ctx->digest->ctx_size) { - OPENSSL_clear_free(ctx->md_data, ctx->digest->ctx_size); - ctx->md_data = NULL; + cleanup_old_md_data(ctx, 1); + + ctx->digest = type; @@ -150,6 +168,10 @@ int EVP_DigestInit_ex(EVP_MD_CTX *ctx, c int EVP_DigestUpdate(EVP_MD_CTX *ctx, const void *data, size_t count) diff --git a/rpm/openssl-1.1.1-s390x-ecc.patch b/rpm/openssl-1.1.1-s390x-ecc.patch index 6b5963f..e53bd59 100644 --- a/rpm/openssl-1.1.1-s390x-ecc.patch +++ b/rpm/openssl-1.1.1-s390x-ecc.patch @@ -1927,7 +1927,7 @@ diff -up openssl-1.1.1g/crypto/s390x_arch.h.s390x-ecc openssl-1.1.1g/crypto/s390 + unsigned long long kdsa[2]; }; - extern struct OPENSSL_s390xcap_st OPENSSL_s390xcap_P; + #if defined(__GNUC__) && defined(__linux) @@ -66,11 +74,14 @@ extern struct OPENSSL_s390xcap_st OPENSS # define S390X_KMF 0x90 # define S390X_PRNO 0xa0 diff --git a/rpm/openssl-1.1.1-system-cipherlist.patch b/rpm/openssl-1.1.1-system-cipherlist.patch index a70f20b..f4b69a3 100644 --- a/rpm/openssl-1.1.1-system-cipherlist.patch +++ b/rpm/openssl-1.1.1-system-cipherlist.patch @@ -238,7 +238,7 @@ diff -up openssl-1.1.1c/ssl/ssl_ciph.c.system-cipherlist openssl-1.1.1c/ssl/ssl_ } /* -@@ -1592,14 +1648,18 @@ STACK_OF(SSL_CIPHER) *ssl_create_cipher_ +@@ -1592,10 +1648,13 @@ STACK_OF(SSL_CIPHER) *ssl_create_cipher_ * if we cannot get one. 
*/ if ((cipherstack = sk_SSL_CIPHER_new_null()) == NULL) { @@ -254,11 +254,6 @@ diff -up openssl-1.1.1c/ssl/ssl_ciph.c.system-cipherlist openssl-1.1.1c/ssl/ssl_ /* Add TLSv1.3 ciphers first - we always prefer those if possible */ for (i = 0; i < sk_SSL_CIPHER_num(tls13_ciphersuites); i++) { if (!sk_SSL_CIPHER_push(cipherstack, - sk_SSL_CIPHER_value(tls13_ciphersuites, i))) { -+ OPENSSL_free(co_list); - sk_SSL_CIPHER_free(cipherstack); - return NULL; - } @@ -1631,6 +1691,14 @@ STACK_OF(SSL_CIPHER) *ssl_create_cipher_ *cipher_list = cipherstack; diff --git a/rpm/openssl-1.1.1-version-override.patch b/rpm/openssl-1.1.1-version-override.patch index 86f102a..bb53a8b 100644 --- a/rpm/openssl-1.1.1-version-override.patch +++ b/rpm/openssl-1.1.1-version-override.patch @@ -1,13 +1,12 @@ -diff --git a/include/openssl/opensslv.h b/include/openssl/opensslv.h -index cbbfab12b3..8fda6b1b85 100644 ---- a/include/openssl/opensslv.h -+++ b/include/openssl/opensslv.h +diff -up openssl-1.1.1q/include/openssl/opensslv.h.version-override openssl-1.1.1q/include/openssl/opensslv.h +--- openssl-1.1.1q/include/openssl/opensslv.h.version-override 2022-07-07 13:14:40.123541142 +0200 ++++ openssl-1.1.1q/include/openssl/opensslv.h 2022-07-07 13:15:20.777288763 +0200 @@ -40,7 +40,7 @@ extern "C" { * major minor fix final patch/beta) */ - # define OPENSSL_VERSION_NUMBER 0x101010cfL --# define OPENSSL_VERSION_TEXT "OpenSSL 1.1.1l 24 Aug 2021" -+# define OPENSSL_VERSION_TEXT "OpenSSL 1.1.1l FIPS 24 Aug 2021" + # define OPENSSL_VERSION_NUMBER 0x1010113fL +-# define OPENSSL_VERSION_TEXT "OpenSSL 1.1.1s 1 Nov 2022" ++# define OPENSSL_VERSION_TEXT "OpenSSL 1.1.1s FIPS 1 Nov 2022" /*- * The macros below are to be used for shared library (.so, .dll, ...) diff --git a/rpm/openssl-1_1-Optimize-AES-GCM-uarchs.patch b/rpm/openssl-1_1-Optimize-AES-GCM-uarchs.patch new file mode 100644 index 0000000..b5d4fc3 --- /dev/null +++ b/rpm/openssl-1_1-Optimize-AES-GCM-uarchs.patch @@ -0,0 +1,7709 @@ +From 954f45ba4c504570206ff5bed811e512cf92dc8e Mon Sep 17 00:00:00 2001 +From: XiaokangQian +Date: Wed, 9 Jun 2021 06:35:46 +0000 +Subject: [PATCH] Optimize AES-GCM for uarchs with unroll and new instructions + +Increase the block numbers to 8 for every iteration. Increase the hash +table capacity. Make use of EOR3 instruction to improve the performance. + +This can improve performance 25-40% on out-of-order microarchitectures +with a large number of fast execution units, such as Neoverse V1. We also +see 20-30% performance improvements on other architectures such as the M1. + +Assembly code reviewd by Tom Cosgrove (ARM). 
+ +Reviewed-by: Bernd Edlinger +Reviewed-by: Paul Dale +(Merged from https://github.com/openssl/openssl/pull/15916) +--- + crypto/arm64cpuid.pl | 8 + + crypto/arm_arch.h | 6 + + crypto/armcap.c | 24 +- + crypto/modes/asm/aes-gcm-armv8-unroll8_64.pl | 7369 +++++++++++++++++ + crypto/modes/asm/ghashv8-armx.pl | 105 +- + crypto/modes/build.info | 4 +- + include/crypto/aes_platform.h | 12 + + .../ciphers/cipher_aes_gcm_hw_armv8.inc | 36 +- + 8 files changed, 7546 insertions(+), 18 deletions(-) + create mode 100644 crypto/modes/asm/aes-gcm-armv8-unroll8_64.pl + +Index: openssl-1.1.1m/crypto/arm64cpuid.pl +=================================================================== +--- openssl-1.1.1m.orig/crypto/arm64cpuid.pl ++++ openssl-1.1.1m/crypto/arm64cpuid.pl +@@ -78,6 +78,14 @@ _armv8_sha512_probe: + ret + .size _armv8_sha512_probe,.-_armv8_sha512_probe + ++.globl _armv8_eor3_probe ++.type _armv8_eor3_probe,%function ++_armv8_eor3_probe: ++ AARCH64_VALID_CALL_TARGET ++ .long 0xce010800 // eor3 v0.16b, v0.16b, v1.16b, v2.16b ++ ret ++.size _armv8_eor3_probe,.-_armv8_eor3_probe ++ + .globl _armv8_cpuid_probe + .type _armv8_cpuid_probe,%function + _armv8_cpuid_probe: +Index: openssl-1.1.1m/crypto/arm_arch.h +=================================================================== +--- openssl-1.1.1m.orig/crypto/arm_arch.h ++++ openssl-1.1.1m/crypto/arm_arch.h +@@ -83,6 +83,9 @@ extern unsigned int OPENSSL_arm_midr; + # define ARMV8_SHA512 (1<<6) + # define ARMV8_CPUID (1<<7) + ++# define ARMV8_SHA3 (1<<11) ++# define ARMV8_UNROLL8_EOR3 (1<<12) ++ + /* + * MIDR_EL1 system register + * +@@ -97,6 +100,7 @@ extern unsigned int OPENSSL_arm_midr; + + # define ARM_CPU_PART_CORTEX_A72 0xD08 + # define ARM_CPU_PART_N1 0xD0C ++# define ARM_CPU_PART_V1 0xD40 + + # define MIDR_PARTNUM_SHIFT 4 + # define MIDR_PARTNUM_MASK (0xfff << MIDR_PARTNUM_SHIFT) +@@ -125,4 +129,29 @@ extern unsigned int OPENSSL_arm_midr; + + # define MIDR_IS_CPU_MODEL(midr, imp, partnum) \ + (((midr) & MIDR_CPU_MODEL_MASK) == MIDR_CPU_MODEL(imp, partnum)) ++ ++# define IS_CPU_SUPPORT_UNROLL8_EOR3() \ ++ (OPENSSL_armcap_P & ARMV8_UNROLL8_EOR3) ++ ++#if defined(__ASSEMBLER__) ++ ++ /* ++ * Support macros for ++ * - Armv8.3-A Pointer Authentication and ++ * - Armv8.5-A Branch Target Identification ++ * features which require emitting a .note.gnu.property section with the ++ * appropriate architecture-dependent feature bits set. 
++ * Read more: "ELF for the Arm® 64-bit Architecture" ++ */ ++ ++# if defined(__ARM_FEATURE_BTI_DEFAULT) && __ARM_FEATURE_BTI_DEFAULT == 1 ++# define GNU_PROPERTY_AARCH64_BTI (1 << 0) /* Has Branch Target Identification */ ++# define AARCH64_VALID_CALL_TARGET hint #34 /* BTI 'c' */ ++# else ++# define GNU_PROPERTY_AARCH64_BTI 0 /* No Branch Target Identification */ ++# define AARCH64_VALID_CALL_TARGET ++# endif ++ ++# endif /* defined __ASSEMBLER__ */ ++ + #endif +Index: openssl-1.1.1m/crypto/armcap.c +=================================================================== +--- openssl-1.1.1m.orig/crypto/armcap.c ++++ openssl-1.1.1m/crypto/armcap.c +@@ -13,6 +13,9 @@ + #include + #include + #include ++#ifdef __APPLE__ ++#include ++#endif + #include "internal/cryptlib.h" + + #include "arm_arch.h" +@@ -134,6 +137,7 @@ static unsigned long getauxval(unsigned + # define HWCAP_CE_SHA1 (1 << 5) + # define HWCAP_CE_SHA256 (1 << 6) + # define HWCAP_CPUID (1 << 11) ++# define HWCAP_SHA3 (1 << 17) + # define HWCAP_CE_SHA512 (1 << 21) + # endif + +@@ -148,12 +152,15 @@ void OPENSSL_cpuid_setup(void) + return; + trigger = 1; + ++ OPENSSL_armcap_P = 0; ++ + if ((e = getenv("OPENSSL_armcap"))) { + OPENSSL_armcap_P = (unsigned int)strtoul(e, NULL, 0); + return; + } + +-# if defined(__APPLE__) && !defined(__aarch64__) ++# if defined(__APPLE__) ++# if !defined(__aarch64__) + /* + * Capability probing by catching SIGILL appears to be problematic + * on iOS. But since Apple universe is "monocultural", it's actually +@@ -169,9 +176,25 @@ void OPENSSL_cpuid_setup(void) + * Unified code works because it never triggers SIGILL on Apple + * devices... + */ +-# endif ++# else ++ { ++ unsigned int feature; ++ size_t len = sizeof(feature); ++ char uarch[64]; + +- OPENSSL_armcap_P = 0; ++ if (sysctlbyname("hw.optional.armv8_2_sha512", &feature, &len, NULL, 0) == 0 && feature == 1) ++ OPENSSL_armcap_P |= ARMV8_SHA512; ++ feature = 0; ++ if (sysctlbyname("hw.optional.armv8_2_sha3", &feature, &len, NULL, 0) == 0 && feature == 1) { ++ OPENSSL_armcap_P |= ARMV8_SHA3; ++ len = sizeof(uarch); ++ if ((sysctlbyname("machdep.cpu.brand_string", uarch, &len, NULL, 0) == 0) && ++ (strncmp(uarch, "Apple M1", 8) == 0)) ++ OPENSSL_armcap_P |= ARMV8_UNROLL8_EOR3; ++ } ++ } ++# endif ++# endif + + # ifdef OSSL_IMPLEMENT_GETAUXVAL + if (getauxval(HWCAP) & HWCAP_NEON) { +@@ -197,6 +220,9 @@ void OPENSSL_cpuid_setup(void) + + if (hwcap & HWCAP_CPUID) + OPENSSL_armcap_P |= ARMV8_CPUID; ++ ++ if (hwcap & HWCAP_SHA3) ++ OPENSSL_armcap_P |= ARMV8_SHA3; + # endif + } + # endif +@@ -240,6 +266,10 @@ void OPENSSL_cpuid_setup(void) + _armv8_sha512_probe(); + OPENSSL_armcap_P |= ARMV8_SHA512; + } ++ if (sigsetjmp(ill_jmp, 1) == 0) { ++ _armv8_eor3_probe(); ++ OPENSSL_armcap_P |= ARMV8_SHA3; ++ } + # endif + } + # endif +@@ -262,6 +292,9 @@ void OPENSSL_cpuid_setup(void) + (OPENSSL_armcap_P & ARMV7_NEON)) { + OPENSSL_armv8_rsa_neonized = 1; + } ++ if ((MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_V1)) && ++ (OPENSSL_armcap_P & ARMV8_SHA3)) ++ OPENSSL_armcap_P |= ARMV8_UNROLL8_EOR3; + # endif + } + #endif +Index: openssl-1.1.1m/crypto/modes/asm/aes-gcm-armv8-unroll8_64.pl +=================================================================== +--- /dev/null ++++ openssl-1.1.1m/crypto/modes/asm/aes-gcm-armv8-unroll8_64.pl +@@ -0,0 +1,7369 @@ ++#! /usr/bin/env perl ++# Copyright 2020-2021 The OpenSSL Project Authors. All Rights Reserved. ++# ++# Licensed under the Apache License 2.0 (the "License"). 
You may not use ++# this file except in compliance with the License. You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++ ++# ++#======================================================================== ++# Written by Xiaokang Qian for the OpenSSL project, ++# derived from https://github.com/ARM-software/AArch64cryptolib, original ++# author Samuel Lee . The module is, however, dual ++# licensed under OpenSSL and SPDX BSD-3-Clause licenses depending on where you ++# obtain it. ++#======================================================================== ++# ++# Approach - We want to reload constants as we have plenty of spare ASIMD slots around crypto units for loading ++# Unroll x8 in main loop, main loop to act on 8 16B blocks per iteration, and then do modulo of the accumulated ++# intermediate hashesfrom the 8 blocks. ++# ++# ____________________________________________________ ++# | | ++# | PRE | ++# |____________________________________________________| ++# | | | | ++# | CTR block 8k+13| AES block 8k+8 | GHASH block 8k+0 | ++# |________________|________________|__________________| ++# | | | | ++# | CTR block 8k+14| AES block 8k+9 | GHASH block 8k+1 | ++# |________________|________________|__________________| ++# | | | | ++# | CTR block 8k+15| AES block 8k+10| GHASH block 8k+2 | ++# |________________|________________|__________________| ++# | | | | ++# | CTR block 8k+16| AES block 8k+11| GHASH block 8k+3 | ++# |________________|________________|__________________| ++# | | | | ++# | CTR block 8k+17| AES block 8k+12| GHASH block 8k+4 | ++# |________________|________________|__________________| ++# | | | | ++# | CTR block 8k+18| AES block 8k+13| GHASH block 8k+5 | ++# |________________|________________|__________________| ++# | | | | ++# | CTR block 8k+19| AES block 8k+14| GHASH block 8k+6 | ++# |________________|________________|__________________| ++# | | | | ++# | CTR block 8k+20| AES block 8k+15| GHASH block 8k+7 | ++# |________________|____(mostly)____|__________________| ++# | | ++# | MODULO | ++# |____________________________________________________| ++# ++# PRE: ++# Ensure previous generated intermediate hash is aligned and merged with result for GHASH 4k+0 ++# EXT low_acc, low_acc, low_acc, #8 ++# EOR res_curr (8k+0), res_curr (4k+0), low_acc ++# ++# CTR block: ++# Increment and byte reverse counter in scalar registers and transfer to SIMD registers ++# REV ctr32, rev_ctr32 ++# ORR ctr64, constctr96_top32, ctr32, LSL #32 ++# INS ctr_next.d[0], constctr96_bottom64 // Keeping this in scalar registers to free up space in SIMD RF ++# INS ctr_next.d[1], ctr64X ++# ADD rev_ctr32, #1 ++# ++# AES block: ++# Do AES encryption/decryption on CTR block X and EOR it with input block X. Take 256 bytes key below for example. 
++# Doing small trick here of loading input in scalar registers, EORing with last key and then transferring ++# Given we are very constrained in our ASIMD registers this is quite important ++# ++# Encrypt: ++# LDR input_low, [ input_ptr ], #8 ++# LDR input_high, [ input_ptr ], #8 ++# EOR input_low, k14_low ++# EOR input_high, k14_high ++# INS res_curr.d[0], input_low ++# INS res_curr.d[1], input_high ++# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k13 ++# EOR res_curr, res_curr, ctr_curr ++# ST1 { res_curr.16b }, [ output_ptr ], #16 ++# ++# Decrypt: ++# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k13 ++# LDR res_curr, [ input_ptr ], #16 ++# EOR res_curr, res_curr, ctr_curr ++# MOV output_low, res_curr.d[0] ++# MOV output_high, res_curr.d[1] ++# EOR output_low, k14_low ++# EOR output_high, k14_high ++# STP output_low, output_high, [ output_ptr ], #16 ++ ++# GHASH block X: ++# Do 128b karatsuba polynomial multiplication on block ++# We only have 64b->128b polynomial multipliers, naively that means we need to do 4 64b multiplies to generate a 128b ++# ++# multiplication: ++# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64 ++# ++# The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies: ++# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64 ++# ++# There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are ++# multiplying with "twisted" powers of H ++# ++# Note: We can PMULL directly into the acc_x in first GHASH of the loop ++# Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical ++# path latency dominates the performance ++# ++# This has a knock on effect on register pressure, so we have to be a bit more clever with our temporary registers ++# than indicated here ++# REV64 res_curr, res_curr ++# INS t_m.d[0], res_curr.d[1] ++# EOR t_m.8B, t_m.8B, res_curr.8B ++# PMULL2 t_h, res_curr, HX ++# PMULL t_l, res_curr, HX ++# PMULL t_m, t_m, HX_k ++# EOR acc_h, acc_h, t_h ++# EOR acc_l, acc_l, t_l ++# EOR acc_m, acc_m, t_m ++# ++# MODULO: take the partial accumulators (~representing sum of 256b multiplication results), from GHASH and do modulo reduction on them ++# There is some 
complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo ++# with a reversed constant ++# EOR3 acc_m, acc_m, acc_l, acc_h // Finish off karatsuba processing ++# PMULL t_mod, acc_h, mod_constant ++# EXT acc_h, acc_h, acc_h, #8 ++# EOR3 acc_m, acc_m, t_mod, acc_h ++# PMULL acc_h, acc_m, mod_constant ++# EXT acc_m, acc_m, acc_m, #8 ++# EOR3 acc_l, acc_l, acc_m, acc_h ++ ++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; ++ ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or ++die "can't locate arm-xlate.pl"; ++ ++die "only for 64 bit" if $flavour !~ /64/; ++ ++open OUT,"| \"$^X\" $xlate $flavour $output"; ++*STDOUT=*OUT; ++ ++$code=<<___; ++#include "arm_arch.h" ++ ++#if __ARM_MAX_ARCH__>=8 ++___ ++$code.=".arch armv8.2-a+crypto\n.arch_extension sha3\n.text\n"; ++ ++$input_ptr="x0"; #argument block ++$bit_length="x1"; ++$output_ptr="x2"; ++$current_tag="x3"; ++$counter="x16"; ++$constant_temp="x15"; ++$modulo_constant="x10"; ++$cc="x8"; ++{ ++my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7)); ++my ($temp2_x,$temp3_x)=map("x$_",(13..14)); ++my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15)); ++my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15)); ++my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7)); ++my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7)); ++my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15)); ++ ++my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15)); ++my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15)); ++my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15)); ++ ++my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19)); ++my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19)); ++ ++my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25)); ++my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25)); ++my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25)); ++my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25)); ++ ++my $t0="v16"; ++my $t0d="d16"; ++ ++my $t1="v29"; ++my $t2=$res1; ++my $t3=$t1; ++ ++my $t4=$res0; ++my $t5=$res2; ++my $t6=$t0; ++ ++my $t7=$res3; ++my $t8=$res4; ++my $t9=$res5; ++ ++my $t10=$res6; ++my $t11="v21"; ++my $t12=$t1; ++ ++my $rtmp_ctr="v30"; ++my $rtmp_ctrq="q30"; ++my $rctr_inc="v31"; ++my $rctr_incd="d31"; ++ ++my $mod_constantd=$t0d; ++my $mod_constant=$t0; ++ ++my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28)); ++my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28)); ++my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28)); ++my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28)); ++my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28)); ++my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28)); ++my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28)); ++my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28)); ++my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28)); ++my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28)); ++my $rk2q1="v28.1q"; ++my $rk3q1="v26.1q"; ++my $rk4v="v27"; ++ ++ ++######################################################################################### ++# size_t unroll8_eor3_aes_gcm_enc_128_kernel(const 
unsigned char *in, ++# size_t len, ++# unsigned char *out, ++# const void *key, ++# unsigned char ivec[16], ++# u64 *Xi); ++# ++$code.=<<___; ++.global unroll8_eor3_aes_gcm_enc_128_kernel ++.type unroll8_eor3_aes_gcm_enc_128_kernel,%function ++.align 4 ++unroll8_eor3_aes_gcm_enc_128_kernel: ++ AARCH64_VALID_CALL_TARGET ++ cbz x1, .L128_enc_ret ++ stp d8, d9, [sp, #-80]! ++ mov $counter, x4 ++ mov $cc, x5 ++ stp d10, d11, [sp, #16] ++ stp d12, d13, [sp, #32] ++ stp d14, d15, [sp, #48] ++ mov x5, #0xc200000000000000 ++ stp x5, xzr, [sp, #64] ++ add $modulo_constant, sp, #64 ++ ++ mov $constant_temp, #0x100000000 @ set up counter increment ++ movi $rctr_inc.16b, #0x0 ++ mov $rctr_inc.d[1], $constant_temp ++ lsr $main_end_input_ptr, $bit_length, #3 @ byte_len ++ ld1 { $ctr0b}, [$counter] @ CTR block 0 ++ ++ sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 ++ ++ and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) ++ ++ rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0 ++ ++ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1 ++ ++ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2 ++ ++ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3 ++ ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4 ++ ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5 ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6 ++ ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7 ++ ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0 ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0 ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 ++ ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 ++ ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0 ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0 ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 ++ ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1 ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1 ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1 ++ ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1 ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 ++ ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2 ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 ++ ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2 ++ ++ aese $ctr1b, $rk2 \n 
aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2 ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2 ++ ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 ++ ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3 ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 ++ ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3 ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3 ++ ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3 ++ ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4 ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4 ++ ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4 ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4 ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 ++ ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 ++ ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5 ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 ++ ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5 ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5 ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5 ++ ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6 ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 ++ ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6 ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6 ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6 ++ ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7 ++ ++ ld1 { $acc_lb}, [$current_tag] ++ ext $acc_lb, $acc_lb, $acc_lb, #8 ++ rev64 $acc_lb, $acc_lb ++ ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7 ++ ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7 ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7 ++ ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 ++ ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8 ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8 ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8 ++ ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8 ++ aese $ctr0b, 
$rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8 ++ ldr $rk10q, [$cc, #160] @ load rk10 ++ ++ aese $ctr3b, $rk9 @ AES block 8k+11 - round 9 ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8 ++ aese $ctr2b, $rk9 @ AES block 8k+10 - round 9 ++ ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8 ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8 ++ aese $ctr6b, $rk9 @ AES block 8k+14 - round 9 ++ ++ aese $ctr4b, $rk9 @ AES block 8k+12 - round 9 ++ add $main_end_input_ptr, $main_end_input_ptr, $input_ptr ++ aese $ctr0b, $rk9 @ AES block 8k+8 - round 9 ++ ++ aese $ctr7b, $rk9 @ AES block 8k+15 - round 9 ++ aese $ctr5b, $rk9 @ AES block 8k+13 - round 9 ++ aese $ctr1b, $rk9 @ AES block 8k+9 - round 9 ++ ++ add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr ++ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks ++ b.ge .L128_enc_tail @ handle tail ++ ++ ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 0, 1 - load plaintext ++ ++ ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 2, 3 - load plaintext ++ ++ ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext ++ ++ ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext ++ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks ++ ++ eor3 $res0b, $ctr_t0b, $ctr0b, $rk10 @ AES block 0 - result ++ rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8 ++ ++ eor3 $res1b, $ctr_t1b, $ctr1b, $rk10 @ AES block 1 - result ++ stp $res0q, $res1q, [$output_ptr], #32 @ AES block 0, 1 - store result ++ ++ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9 ++ eor3 $res5b, $ctr_t5b, $ctr5b, $rk10 @ AES block 5 - result ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9 ++ ++ eor3 $res2b, $ctr_t2b, $ctr2b, $rk10 @ AES block 2 - result ++ eor3 $res6b, $ctr_t6b, $ctr6b, $rk10 @ AES block 6 - result ++ eor3 $res4b, $ctr_t4b, $ctr4b, $rk10 @ AES block 4 - result ++ ++ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10 ++ ++ eor3 $res3b, $ctr_t3b, $ctr3b, $rk10 @ AES block 3 - result ++ eor3 $res7b, $ctr_t7b, $ctr7b,$rk10 @ AES block 7 - result ++ stp $res2q, $res3q, [$output_ptr], #32 @ AES block 2, 3 - store result ++ ++ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11 ++ stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result ++ ++ stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result ++ ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12 ++ b.ge .L128_enc_prepretail @ do prepretail ++ ++.L128_enc_main_loop: @ main loop start ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13 ++ ldr $h5q, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ldr $h6q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13 ++ ++ rev64 $res1b, $res1b @ GHASH block 8k+1 ++ rev64 $res0b, $res0b @ GHASH block 8k ++ ldr $h7q, [$current_tag, #176] @ load h7l | h7h ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ldr $h8q, [$current_tag, #208] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14 ++ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 ++ ++ 
ldr $h56kq, [$current_tag, #144] @ load h6k | h5k ++ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k ++ rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free) ++ rev64 $res3b, $res3b @ GHASH block 8k+3 ++ ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ eor $res0b, $res0b, $acc_lb @ PRE 1 ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15 ++ ++ rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free) ++ ++ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high ++ rev64 $res2b, $res2b @ GHASH block 8k+2 ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high ++ ++ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low ++ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low ++ ++ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high ++ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high ++ ++ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ldr $h4q, [$current_tag, #112] @ load h3l | h3h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0 ++ ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0 ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0 ++ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15 ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0 ++ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid ++ ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0 ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1 ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0 ++ ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1 ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0 ++ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low ++ ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1 ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0 ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1 ++ ++ eor3 $acc_hb, $acc_hb, $t1.16b,$t2.16b @ GHASH block 8k+2, 8k+3 - high ++ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1 ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1 ++ ++ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1 ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1 ++ ++ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid ++ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid ++ ++ rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free) ++ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low ++ ++ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid ++ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid ++ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid ++ ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 
8k+13 - round 2 ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2 ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2 ++ ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2 ++ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2 ++ ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2 ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2 ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2 ++ ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3 ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free) ++ ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3 ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3 ++ ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high ++ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low ++ ++ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3 ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3 ++ ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3 ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3 ++ ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3 ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4 ++ ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4 ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4 ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4 ++ ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4 ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4 ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4 ++ ++ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high ++ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low ++ ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4 ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ ++ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid ++ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid ++ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high ++ ++ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5 ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5 ++ ++ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low ++ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high ++ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ ++ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5 ++ ++ eor $res6.16b, $res6.16b, $t9.16b @ GHASH 
block 8k+6, 8k+7 - mid ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5 ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5 ++ ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5 ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5 ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5 ++ ++ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low ++ ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6 ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6 ++ ++ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6 ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6 ++ ++ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid ++ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low ++ ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load plaintext ++ ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6 ++ rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16 ++ ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6 ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6 ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6 ++ ++ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high ++ ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7 ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7 ++ ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load plaintext ++ ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7 ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7 ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7 ++ ++ pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7 ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7 ++ ++ rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17 ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7 ++ ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8 ++ ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load plaintext ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17 ++ ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8 ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8 ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8 ++ ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8 ++ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ ldr $rk10q, [$cc, #160] @ load rk10 ++ ++ ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18 ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8 ++ ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, 
$ctr0b @ AES block 8k+8 - round 8 ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8 ++ ++ aese $ctr2b, $rk9 @ AES block 8k+10 - round 9 ++ aese $ctr4b, $rk9 @ AES block 8k+12 - round 9 ++ aese $ctr1b, $rk9 @ AES block 8k+9 - round 9 ++ ++ ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load plaintext ++ rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19 ++ ++ cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL ++ eor3 $res4b, $ctr_t4b, $ctr4b, $rk10 @ AES block 4 - result ++ aese $ctr7b, $rk9 @ AES block 8k+15 - round 9 ++ ++ aese $ctr6b, $rk9 @ AES block 8k+14 - round 9 ++ aese $ctr3b, $rk9 @ AES block 8k+11 - round 9 ++ ++ eor3 $res2b, $ctr_t2b, $ctr2b, $rk10 @ AES block 8k+10 - result ++ ++ mov $ctr2.16b, $h3.16b @ CTR block 8k+18 ++ aese $ctr0b, $rk9 @ AES block 8k+8 - round 9 ++ ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20 ++ ++ eor3 $res7b, $ctr_t7b, $ctr7b, $rk10 @ AES block 7 - result ++ aese $ctr5b, $rk9 @ AES block 8k+13 - round 9 ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ ++ eor3 $res1b, $ctr_t1b, $ctr1b, $rk10 @ AES block 8k+9 - result ++ eor3 $res3b, $ctr_t3b, $ctr3b, $rk10 @ AES block 8k+11 - result ++ mov $ctr3.16b, $h4.16b @ CTR block 8k+19 ++ ++ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ eor3 $res5b, $ctr_t5b, $ctr5b, $rk10 @ AES block 5 - result ++ mov $ctr1.16b, $h2.16b @ CTR block 8k+17 ++ ++ eor3 $res0b, $ctr_t0b, $ctr0b, $rk10 @ AES block 8k+8 - result ++ mov $ctr0.16b, $h1.16b @ CTR block 8k+16 ++ stp $res0q, $res1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result ++ ++ stp $res2q, $res3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result ++ eor3 $res6b, $ctr_t6b, $ctr6b, $rk10 @ AES block 6 - result ++ ++ stp $res4q, $res5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result ++ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low ++ ++ stp $res6q, $res7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result ++ b.lt .L128_enc_main_loop ++ ++.L128_enc_prepretail: @ PREPRETAIL ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13 ++ ldr $h7q, [$current_tag, #176] @ load h7l | h7h ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ldr $h8q, [$current_tag, #208] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 ++ ++ ldr $h5q, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ldr $h6q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ rev64 $res0b, $res0b @ GHASH block 8k ++ rev64 $res1b, $res1b @ GHASH block 8k+1 ++ ++ ldr $h56kq, [$current_tag, #144] @ load h6k | h5k ++ ldr $h78kq, [$current_tag, #192] @ load h6k | h5k ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13 ++ rev64 $res3b, $res3b @ GHASH block 8k+3 ++ ++ rev64 $res2b, $res2b @ GHASH block 8k+2 ++ eor $res0b, $res0b, $acc_lb @ PRE 1 ++ ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14 ++ ++ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high ++ ++ rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free) ++ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ ++ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH 
block 8k+1 - low ++ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high ++ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ ++ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low ++ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid ++ ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14 ++ ++ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid ++ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid ++ ++ rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free) ++ rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free) ++ ++ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid ++ ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15 ++ ++ rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free) ++ ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0 ++ ++ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high ++ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high ++ ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0 ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0 ++ ++ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0 ++ ++ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high ++ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0 ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0 ++ ++ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0 ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0 ++ ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1 ++ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low ++ ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1 ++ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid ++ ++ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low ++ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid ++ ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1 ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1 ++ ++ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1 ++ ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1 ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1 ++ ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2 ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2 ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2 ++ ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2 ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1 ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2 ++ ++ aese $ctr5b, 
$rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3 ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2 ++ ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2 ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2 ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3 ++ ++ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3 ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3 ++ ++ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low ++ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high ++ ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15 ++ ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3 ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3 ++ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ ++ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3 ++ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high ++ ++ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low ++ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4 ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4 ++ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high ++ ++ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low ++ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid ++ ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5 ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4 ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4 ++ ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4 ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4 ++ ++ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4 ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4 ++ ++ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low ++ ++ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid ++ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid ++ ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5 ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5 ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5 ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5 ++ ++ 
eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5 ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5 ++ ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5 ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6 ++ ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6 ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6 ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6 ++ ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6 ++ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low ++ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6 ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6 ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6 ++ ++ pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7 ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7 ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7 ++ ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7 ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7 ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7 ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7 ++ ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7 ++ ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8 ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8 ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8 ++ ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8 ++ eor3 $acc_lb, $acc_lb, $acc_hb, $acc_mb @ MODULO - fold into low ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8 ++ ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8 ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8 ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8 ++ ++ ldr $rk10q, [$cc, #160] @ load rk10 ++ aese $ctr6b, $rk9 @ AES block 8k+14 - round 9 ++ aese $ctr2b, $rk9 @ AES block 8k+10 - round 9 ++ ++ aese $ctr0b, $rk9 @ AES block 8k+8 - round 9 ++ aese $ctr1b, $rk9 @ AES block 8k+9 - round 9 ++ ++ aese $ctr3b, $rk9 @ AES block 8k+11 - round 9 ++ aese $ctr5b, $rk9 @ AES block 8k+13 - round 9 ++ ++ aese $ctr4b, $rk9 @ AES block 8k+12 - round 9 ++ aese $ctr7b, $rk9 @ AES block 8k+15 - round 9 ++.L128_enc_tail: @ TAIL ++ ++ sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process ++ ldr $ctr_t0q, [$input_ptr], #16 @ AES block 8k+8 - load plaintext ++ ++ mov $t1.16b, $rk10 ++ ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ 
++ eor3 $res1b, $ctr_t0b, $ctr0b, $t1.16b @ AES block 8k+8 - result ++ ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag ++ ldp $h6q, $h7q, [$current_tag, #160] @ load h6k | h5k ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ++ ldp $h78kq, $h8q, [$current_tag, #192] @ load h7l | h7h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ cmp $main_end_input_ptr, #112 ++ b.gt .L128_enc_blocks_more_than_7 ++ ++ mov $ctr7b, $ctr6b ++ mov $ctr6b, $ctr5b ++ movi $acc_h.8b, #0 ++ ++ cmp $main_end_input_ptr, #96 ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ mov $ctr5b, $ctr4b ++ ++ mov $ctr4b, $ctr3b ++ mov $ctr3b, $ctr2b ++ mov $ctr2b, $ctr1b ++ ++ movi $acc_l.8b, #0 ++ movi $acc_m.8b, #0 ++ b.gt .L128_enc_blocks_more_than_6 ++ ++ mov $ctr7b, $ctr6b ++ cmp $main_end_input_ptr, #80 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ mov $ctr6b, $ctr5b ++ mov $ctr5b, $ctr4b ++ ++ mov $ctr4b, $ctr3b ++ mov $ctr3b, $ctr1b ++ b.gt .L128_enc_blocks_more_than_5 ++ ++ cmp $main_end_input_ptr, #64 ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ++ mov $ctr7b, $ctr6b ++ mov $ctr6b, $ctr5b ++ ++ mov $ctr5b, $ctr4b ++ mov $ctr4b, $ctr1b ++ b.gt .L128_enc_blocks_more_than_4 ++ ++ mov $ctr7b, $ctr6b ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ mov $ctr6b, $ctr5b ++ ++ mov $ctr5b, $ctr1b ++ cmp $main_end_input_ptr, #48 ++ b.gt .L128_enc_blocks_more_than_3 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ mov $ctr7b, $ctr6b ++ mov $ctr6b, $ctr1b ++ ++ cmp $main_end_input_ptr, #32 ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ b.gt .L128_enc_blocks_more_than_2 ++ ++ cmp $main_end_input_ptr, #16 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ mov $ctr7b, $ctr1b ++ b.gt .L128_enc_blocks_more_than_1 ++ ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ b .L128_enc_blocks_less_than_1 ++.L128_enc_blocks_more_than_7: @ blocks left > 7 ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-7 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-7 block ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-6 block - load plaintext ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid ++ ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high ++ ++ ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ eor3 $res1b, $ctr_t1b, $ctr1b, $t1.16b @ AES final-6 block - result ++ ++ pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low ++.L128_enc_blocks_more_than_6: @ blocks left > 6 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-6 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-6 block ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-5 block - load plaintext ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid ++ ++ eor3 $res1b, $ctr_t1b, $ctr2b, $t1.16b @ AES final-5 block - result ++ pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid ++ pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low ++ ++ eor 
$acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high ++.L128_enc_blocks_more_than_5: @ blocks left > 5 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-5 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-5 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-4 block - load plaintext ++ pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid ++ ++ eor3 $res1b, $ctr_t1b, $ctr3b, $t1.16b @ AES final-4 block - result ++ pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid ++.L128_enc_blocks_more_than_4: @ blocks left > 4 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-4 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-4 block ++ ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-3 block - load plaintext ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid ++ ++ pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high ++ pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low ++ ++ eor3 $res1b, $ctr_t1b, $ctr4b, $t1.16b @ AES final-3 block - result ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid ++.L128_enc_blocks_more_than_3: @ blocks left > 3 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result ++ ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ ++ rev64 $res0b, $res1b @ GHASH final-3 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low ++ ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-2 block - load plaintext ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low ++ ++ eor3 $res1b, $ctr_t1b, $ctr5b, $t1.16b @ AES final-2 block - result ++ ++ pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid ++ pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high ++.L128_enc_blocks_more_than_2: @ blocks left > 2 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-2 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-1 block - load plaintext ++ ++ ins $rk4v.d[0], $res0.d[1] @ 
GHASH final-2 block - mid ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid ++ eor3 $res1b, $ctr_t1b, $ctr6b, $t1.16b @ AES final-1 block - result ++ ++ pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high ++ ++ pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low ++ pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low ++.L128_enc_blocks_more_than_1: @ blocks left > 1 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result ++ ++ ldr $h2q, [$current_tag, #64] @ load h2l | h2h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ rev64 $res0b, $res1b @ GHASH final-1 block ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final block - load plaintext ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid ++ eor3 $res1b, $ctr_t1b, $ctr7b, $t1.16b @ AES final block - result ++ ++ pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid ++ ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid ++ ++ pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low ++ pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low ++.L128_enc_blocks_less_than_1: @ blocks left <= 1 ++ ++ rev32 $rtmp_ctr.16b, $rtmp_ctr.16b ++ str $rtmp_ctrq, [$counter] @ store the updated counter ++ and $bit_length, $bit_length, #127 @ bit_length %= 128 ++ ++ sub $bit_length, $bit_length, #128 @ bit_length -= 128 ++ ++ neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) ++ ++ mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff ++ ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored ++ and $bit_length, $bit_length, #127 @ bit_length %= 128 ++ ++ lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block ++ mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff ++ cmp $bit_length, #64 ++ ++ csel $temp2_x, $temp1_x, $temp0_x, lt ++ csel $temp3_x, $temp0_x, xzr, lt ++ ++ mov $ctr0.d[1], $temp3_x ++ mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block ++ ++ and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits ++ ++ rev64 $res0b, $res1b @ GHASH final block ++ ++ bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing ++ st1 { $res1b}, [$output_ptr] @ store all 16B ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $t0.d[0], $res0.d[1] @ GHASH final block - mid ++ ++ eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ++ pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid ++ ++ pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high ++ eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ ++ 
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low ++ ++ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ ++ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ ++ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low ++ ext $acc_lb, $acc_lb, $acc_lb, #8 ++ rev64 $acc_lb, $acc_lb ++ st1 { $acc_l.16b }, [$current_tag] ++ lsr x0, $bit_length, #3 @ return sizes ++ ++ ldp d10, d11, [sp, #16] ++ ldp d12, d13, [sp, #32] ++ ldp d14, d15, [sp, #48] ++ ldp d8, d9, [sp], #80 ++ ret ++ ++.L128_enc_ret: ++ mov w0, #0x0 ++ ret ++.size unroll8_eor3_aes_gcm_enc_128_kernel,.-unroll8_eor3_aes_gcm_enc_128_kernel ++___ ++ ++######################################################################################### ++# size_t unroll8_eor3_aes_gcm_dec_128_kernel(const unsigned char *in, ++# size_t len, ++# unsigned char *out, ++# u64 *Xi, ++# unsigned char ivec[16], ++# const void *key); ++# ++$code.=<<___; ++.global unroll8_eor3_aes_gcm_dec_128_kernel ++.type unroll8_eor3_aes_gcm_dec_128_kernel,%function ++.align 4 ++unroll8_eor3_aes_gcm_dec_128_kernel: ++ AARCH64_VALID_CALL_TARGET ++ cbz x1, .L128_dec_ret ++ stp d8, d9, [sp, #-80]! ++ mov $counter, x4 ++ mov $cc, x5 ++ stp d10, d11, [sp, #16] ++ stp d12, d13, [sp, #32] ++ stp d14, d15, [sp, #48] ++ mov x5, #0xc200000000000000 ++ stp x5, xzr, [sp, #64] ++ add $modulo_constant, sp, #64 ++ ++ lsr $main_end_input_ptr, $bit_length, #3 @ byte_len ++ ld1 { $ctr0b}, [$counter] @ CTR block 0 ++ ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 ++ ++ mov $constant_temp, #0x100000000 @ set up counter increment ++ movi $rctr_inc.16b, #0x0 ++ mov $rctr_inc.d[1], $constant_temp ++ ld1 { $acc_lb}, [$current_tag] ++ ext $acc_lb, $acc_lb, $acc_lb, #8 ++ rev64 $acc_lb, $acc_lb ++ ++ rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter ++ ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0 ++ ++ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1 ++ ++ and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) ++ ++ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2 ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 ++ ++ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3 ++ ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 ++ ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4 ++ ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5 ++ ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 ++ ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6 ++ add $rtmp_ctr.4s, 
$rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6 ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0 ++ ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0 ++ ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7 ++ ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0 ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 ++ ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0 ++ ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1 ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1 ++ ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1 ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1 ++ ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2 ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 ++ ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2 ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2 ++ ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2 ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 ++ ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3 ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 ++ ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3 ++ ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3 ++ ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 ++ ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4 ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3 ++ ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4 ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 ++ ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4 ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4 ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 ++ ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 ++ ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5 ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 ++ ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5 ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5 ++ ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5 ++ ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 ++ ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6 ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6 ++ aese 
$ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 ++ ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6 ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6 ++ ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7 ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 ++ ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7 ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7 ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7 ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 ++ ++ add $main_end_input_ptr, $main_end_input_ptr, $input_ptr ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7 ++ ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8 ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 ++ ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8 ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 ++ ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8 ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8 ++ ++ aese $ctr0b, $rk9 @ AES block 0 - round 9 ++ aese $ctr1b, $rk9 @ AES block 1 - round 9 ++ aese $ctr6b, $rk9 @ AES block 6 - round 9 ++ ++ ldr $rk10q, [$cc, #160] @ load rk10 ++ aese $ctr4b, $rk9 @ AES block 4 - round 9 ++ aese $ctr3b, $rk9 @ AES block 3 - round 9 ++ ++ aese $ctr2b, $rk9 @ AES block 2 - round 9 ++ aese $ctr5b, $rk9 @ AES block 5 - round 9 ++ aese $ctr7b, $rk9 @ AES block 7 - round 9 ++ ++ add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr ++ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks ++ b.ge .L128_dec_tail @ handle tail ++ ++ ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 0, 1 - load ciphertext ++ ++ eor3 $ctr0b, $res0b, $ctr0b, $rk10 @ AES block 0 - result ++ eor3 $ctr1b, $res1b, $ctr1b, $rk10 @ AES block 1 - result ++ stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 0, 1 - store result ++ ++ rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8 ++ ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 2, 3 - load ciphertext ++ ++ ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 4, 5 - load ciphertext ++ ++ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9 ++ ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 6, 7 - load ciphertext ++ ++ eor3 $ctr3b, $res3b, $ctr3b, $rk10 @ AES block 3 - result ++ eor3 $ctr2b, $res2b, $ctr2b, $rk10 @ AES block 2 - result ++ stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 2, 3 - store result ++ ++ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10 ++ ++ eor3 $ctr6b, $res6b, $ctr6b, $rk10 @ AES block 6 - result ++ ++ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11 ++ ++ eor3 $ctr4b, $res4b, $ctr4b, $rk10 @ AES block 4 - result ++ eor3 $ctr5b, $res5b, $ctr5b, $rk10 @ AES block 5 - result ++ stp $ctr4q, $ctr5q, [$output_ptr], #32 @ 
AES block 4, 5 - store result ++ ++ eor3 $ctr7b, $res7b, $ctr7b, $rk10 @ AES block 7 - result ++ stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 6, 7 - store result ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12 ++ ++ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12 ++ b.ge .L128_dec_prepretail @ do prepretail ++ ++.L128_dec_main_loop: @ main loop start ++ ldr $h7q, [$current_tag, #176] @ load h7l | h7h ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ldr $h8q, [$current_tag, #208] @ load h7l | h7h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ ++ rev64 $res1b, $res1b @ GHASH block 8k+1 ++ rev64 $res0b, $res0b @ GHASH block 8k ++ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 ++ ++ rev64 $res6b, $res6b @ GHASH block 8k+6 ++ ldr $h5q, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ldr $h6q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ ++ eor $res0b, $res0b, $acc_lb @ PRE 1 ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13 ++ ++ rev64 $res2b, $res2b @ GHASH block 8k+2 ++ rev64 $res4b, $res4b @ GHASH block 8k+4 ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14 ++ ldr $h56kq, [$current_tag, #144] @ load h6k | h5k ++ ldr $h78kq, [$current_tag, #192] @ load h6k | h5k ++ ++ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high ++ rev64 $res3b, $res3b @ GHASH block 8k+3 ++ ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15 ++ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ rev64 $res5b, $res5b @ GHASH block 8k+5 ++ ++ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low ++ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ ++ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0 ++ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high ++ ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0 ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0 ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0 ++ ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0 ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0 ++ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high ++ ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0 ++ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0 ++ ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1 ++ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low ++ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high ++ ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1 ++ ++ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low ++ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid ++ ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ldr 
$h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1 ++ ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1 ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1 ++ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low ++ ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1 ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1 ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1 ++ ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2 ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2 ++ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low ++ ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2 ++ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ ++ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2 ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2 ++ ++ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2 ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2 ++ ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2 ++ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid ++ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid ++ ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3 ++ rev64 $res7b, $res7b @ GHASH block 8k+7 ++ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high ++ ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low ++ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3 ++ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3 ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3 ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3 ++ ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3 ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3 ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3 ++ ++ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high ++ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low ++ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high ++ ++ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4 ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4 ++ ++ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4 ++ ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4 ++ aese 
$ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4 ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4 ++ ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4 ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4 ++ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5 ++ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid ++ ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5 ++ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5 ++ ++ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5 ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5 ++ ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5 ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5 ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5 ++ ++ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high ++ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low ++ ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6 ++ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6 ++ ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6 ++ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6 ++ ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6 ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6 ++ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low ++ ++ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15 ++ ++ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6 ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7 ++ ++ rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16 ++ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16 ++ ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7 ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7 ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7 ++ ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7 ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7 ++ rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17 ++ ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7 ++ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ ++ eor3 $acc_mb, $acc_mb, $acc_hb, 
$acc_lb @ MODULO - karatsuba tidy up ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17 ++ ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8 ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8 ++ ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load ciphertext ++ ++ ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load ciphertext ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8 ++ rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18 ++ ++ ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load ciphertext ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8 ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ ++ ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load ciphertext ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18 ++ ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8 ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8 ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8 ++ ++ aese $ctr0b, $rk9 @ AES block 8k+8 - round 9 ++ aese $ctr1b, $rk9 @ AES block 8k+9 - round 9 ++ ldr $rk10q, [$cc, #160] @ load rk10 ++ ++ aese $ctr6b, $rk9 @ AES block 8k+14 - round 9 ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ aese $ctr2b, $rk9 @ AES block 8k+10 - round 9 ++ ++ aese $ctr7b, $rk9 @ AES block 8k+15 - round 9 ++ aese $ctr4b, $rk9 @ AES block 8k+12 - round 9 ++ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ ++ rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19 ++ ++ aese $ctr3b, $rk9 @ AES block 8k+11 - round 9 ++ aese $ctr5b, $rk9 @ AES block 8k+13 - round 9 ++ eor3 $ctr1b, $res1b, $ctr1b, $rk10 @ AES block 8k+9 - result ++ ++ eor3 $ctr0b, $res0b, $ctr0b, $rk10 @ AES block 8k+8 - result ++ eor3 $ctr7b, $res7b, $ctr7b, $rk10 @ AES block 8k+15 - result ++ eor3 $ctr6b, $res6b, $ctr6b, $rk10 @ AES block 8k+14 - result ++ ++ eor3 $ctr2b, $res2b, $ctr2b, $rk10 @ AES block 8k+10 - result ++ stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result ++ mov $ctr1.16b, $h2.16b @ CTR block 8k+17 ++ ++ eor3 $ctr4b, $res4b, $ctr4b, $rk10 @ AES block 8k+12 - result ++ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low ++ mov $ctr0.16b, $h1.16b @ CTR block 8k+16 ++ ++ eor3 $ctr3b, $res3b, $ctr3b, $rk10 @ AES block 8k+11 - result ++ cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL ++ stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result ++ ++ eor3 $ctr5b, $res5b, $ctr5b, $rk10 @ AES block 8k+13 - result ++ mov $ctr2.16b, $h3.16b @ CTR block 8k+18 ++ ++ stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20 ++ ++ stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result ++ mov $ctr3.16b, $h4.16b @ CTR block 8k+19 ++ b.lt .L128_dec_main_loop ++ ++.L128_dec_prepretail: @ PREPRETAIL ++ rev64 $res3b, $res3b @ GHASH block 8k+3 ++ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 ++ rev64 $res0b, $res0b @ GHASH block 8k ++ ++ rev64 $res2b, $res2b @ GHASH block 8k+2 ++ rev32 $ctr5.16b, 
$rtmp_ctr.16b @ CTR block 8k+13 ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ ++ ldr $h7q, [$current_tag, #176] @ load h7l | h7h ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ldr $h8q, [$current_tag, #208] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ eor $res0b, $res0b, $acc_lb @ PRE 1 ++ rev64 $res1b, $res1b @ GHASH block 8k+1 ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13 ++ ldr $h5q, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ldr $h6q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ rev64 $res5b, $res5b @ GHASH block 8k+5 ++ ++ rev64 $res4b, $res4b @ GHASH block 8k+4 ++ ++ rev64 $res6b, $res6b @ GHASH block 8k+6 ++ ++ ldr $h56kq, [$current_tag, #144] @ load h6k | h5k ++ ldr $h78kq, [$current_tag, #192] @ load h6k | h5k ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14 ++ ++ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high ++ ++ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high ++ ++ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low ++ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0 ++ ++ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0 ++ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid ++ ++ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15 ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0 ++ ++ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high ++ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0 ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0 ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0 ++ ++ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid ++ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid ++ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low ++ ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1 ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0 ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0 ++ ++ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low ++ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid ++ ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1 ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1 ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1 ++ ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low ++ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid ++ ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ aese $ctr1b, 
$rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1 ++ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid ++ ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1 ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1 ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1 ++ ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2 ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2 ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2 ++ ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2 ++ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2 ++ ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2 ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2 ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2 ++ ++ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high ++ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low ++ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ rev64 $res7b, $res7b @ GHASH block 8k+7 ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3 ++ ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high ++ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low ++ ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3 ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3 ++ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ ++ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high ++ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low ++ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3 ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3 ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3 ++ ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3 ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3 ++ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ ++ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4 ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4 ++ ++ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4 ++ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid ++ ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4 ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4 ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4 ++ ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4 ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4 ++ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid ++ ++ pmull2 
$t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high ++ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid ++ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid ++ ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5 ++ ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low ++ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low ++ ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5 ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5 ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5 ++ ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5 ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5 ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5 ++ ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5 ++ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low ++ ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6 ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6 ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6 ++ ++ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6 ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6 ++ ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6 ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6 ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6 ++ ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7 ++ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ ++ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7 ++ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7 ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7 ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7 ++ ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7 ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7 ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7 ++ ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ ldr $rk10q, [$cc, #160] @ load rk10 ++ ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8 ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8 ++ ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8 ++ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8 ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8 ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8 ++ ++ aese $ctr6b, $rk9 @ AES block 8k+14 - round 9 ++ aese 
$ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
++
++ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
++ aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
++
++ aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
++ aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
++ aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
++
++ aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
++ aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
++ aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
++
++.L128_dec_tail: @ TAIL
++
++ mov $t1.16b, $rk10
++ sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
++
++ cmp $main_end_input_ptr, #112
++
++ ldp $h78kq, $h8q, [$current_tag, #192] @ load h7l | h7h
++ ext $h8.16b, $h8.16b, $h8.16b, #8
++ ldr $res1q, [$input_ptr], #16 @ AES block 8k+8 - load ciphertext
++
++ ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h
++ ext $h5.16b, $h5.16b, $h5.16b, #8
++ ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
++
++ ldp $h6q, $h7q, [$current_tag, #160] @ load h6k | h5k
++ ext $h6.16b, $h6.16b, $h6.16b, #8
++ ext $h7.16b, $h7.16b, $h7.16b, #8
++
++ eor3 $res4b, $res1b, $ctr0b, $t1.16b @ AES block 8k+8 - result
++ b.gt .L128_dec_blocks_more_than_7
++
++ cmp $main_end_input_ptr, #96
++ mov $ctr7b, $ctr6b
++ movi $acc_l.8b, #0
++
++ movi $acc_h.8b, #0
++ mov $ctr6b, $ctr5b
++ mov $ctr5b, $ctr4b
++
++ mov $ctr4b, $ctr3b
++ mov $ctr3b, $ctr2b
++ mov $ctr2b, $ctr1b
++
++ movi $acc_m.8b, #0
++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
++ b.gt .L128_dec_blocks_more_than_6
++
++ cmp $main_end_input_ptr, #80
++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
++
++ mov $ctr7b, $ctr6b
++ mov $ctr6b, $ctr5b
++ mov $ctr5b, $ctr4b
++
++ mov $ctr4b, $ctr3b
++ mov $ctr3b, $ctr1b
++ b.gt .L128_dec_blocks_more_than_5
++
++ cmp $main_end_input_ptr, #64
++
++ mov $ctr7b, $ctr6b
++ mov $ctr6b, $ctr5b
++ mov $ctr5b, $ctr4b
++
++ mov $ctr4b, $ctr1b
++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
++ b.gt .L128_dec_blocks_more_than_4
++
++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
++ mov $ctr7b, $ctr6b
++ mov $ctr6b, $ctr5b
++
++ mov $ctr5b, $ctr1b
++ cmp $main_end_input_ptr, #48
++ b.gt .L128_dec_blocks_more_than_3
++
++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
++ mov $ctr7b, $ctr6b
++ cmp $main_end_input_ptr, #32
++
++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
++ mov $ctr6b, $ctr1b
++ b.gt .L128_dec_blocks_more_than_2
++
++ cmp $main_end_input_ptr, #16
++
++ mov $ctr7b, $ctr1b
++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
++ b.gt .L128_dec_blocks_more_than_1
++
++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
++ b .L128_dec_blocks_less_than_1
++.L128_dec_blocks_more_than_7: @ blocks left > 7
++ rev64 $res0b, $res1b @ GHASH final-7 block
++
++ eor $res0b, $res0b, $t0.16b @ feed in partial tag
++
++ ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
++
++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
++
++ movi $t0.8b, #0 @ surpress further partial tag feed in
++ ldr $res1q, [$input_ptr], #16 @ AES final-6 block - load ciphertext
++
++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
++
++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
++ st1 { $res4b}, [$output_ptr], #16 @ AES final-7 block - store result
++ eor3
$res4b, $res1b, $ctr1b, $t1.16b @ AES final-6 block - result ++ ++ pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid ++.L128_dec_blocks_more_than_6: @ blocks left > 6 ++ ++ rev64 $res0b, $res1b @ GHASH final-6 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid ++ ++ pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low ++ ldr $res1q, [$input_ptr], #16 @ AES final-5 block - load ciphertext ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-6 block - store result ++ pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid ++ eor3 $res4b, $res1b, $ctr2b, $t1.16b @ AES final-5 block - result ++.L128_dec_blocks_more_than_5: @ blocks left > 5 ++ ++ rev64 $res0b, $res1b @ GHASH final-5 block ++ ++ ldr $res1q, [$input_ptr], #16 @ AES final-4 block - load ciphertext ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-5 block - store result ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid ++ ++ eor3 $res4b, $res1b, $ctr3b, $t1.16b @ AES final-4 block - result ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid ++ pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid ++ pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high ++.L128_dec_blocks_more_than_4: @ blocks left > 4 ++ ++ rev64 $res0b, $res1b @ GHASH final-4 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ldr $res1q, [$input_ptr], #16 @ AES final-3 block - load ciphertext ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high ++ ++ pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high ++ ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-4 block - store result ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid ++ ++ eor3 $res4b, $res1b, $ctr4b, $t1.16b @ AES final-3 block - result ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low ++ ++ pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid ++.L128_dec_blocks_more_than_3: @ blocks left > 3 ++ ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-3 block - store result ++ rev64 $res0b, $res1b @ GHASH final-3 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid ++ ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid ++ ++ ldr $res1q, [$input_ptr], #16 @ AES 
final-2 block - load ciphertext ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid ++ pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low ++ pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high ++ ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ eor3 $res4b, $res1b, $ctr5b, $t1.16b @ AES final-2 block - result ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low ++ ++ pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid ++.L128_dec_blocks_more_than_2: @ blocks left > 2 ++ ++ rev64 $res0b, $res1b @ GHASH final-2 block ++ ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-2 block - store result ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid ++ ++ pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low ++ ++ pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high ++ pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid ++ ldr $res1q, [$input_ptr], #16 @ AES final-1 block - load ciphertext ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low ++ ++ eor3 $res4b, $res1b, $ctr6b, $t1.16b @ AES final-1 block - result ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high ++.L128_dec_blocks_more_than_1: @ blocks left > 1 ++ ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-1 block - store result ++ rev64 $res0b, $res1b @ GHASH final-1 block ++ ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid ++ ++ ldr $res1q, [$input_ptr], #16 @ AES final block - load ciphertext ++ pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid ++ eor3 $res4b, $res1b, $ctr7b, $t1.16b @ AES final block - result ++ ++ pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low ++ ++ pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid ++.L128_dec_blocks_less_than_1: @ blocks left <= 1 ++ ++ and $bit_length, $bit_length, #127 @ bit_length %= 128 ++ ++ sub $bit_length, $bit_length, #128 @ bit_length -= 128 ++ ++ neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) ++ ++ mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff ++ and $bit_length, $bit_length, #127 @ bit_length %= 128 ++ ++ lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block ++ cmp $bit_length, #64 ++ mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff ++ ++ csel $temp2_x, $temp1_x, $temp0_x, lt ++ csel $temp3_x, $temp0_x, xzr, lt ++ ++ mov $ctr0.d[1], $temp3_x ++ mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block ++ ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, 
$h1.16b, $h1.16b, #8 ++ ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored ++ ++ and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits ++ ++ rev64 $res0b, $res1b @ GHASH final block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high ++ ins $t0.d[0], $res0.d[1] @ GHASH final block - mid ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high ++ eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid ++ ++ bif $res4b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing ++ ++ pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid ++ st1 { $res4b}, [$output_ptr] @ store all 16B ++ ++ pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low ++ ++ eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low ++ ++ eor $t10.16b, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ ++ pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ ++ eor $acc_mb, $acc_mb, $t10.16b @ MODULO - karatsuba tidy up ++ ++ eor3 $acc_mb, $acc_mb, $acc_hb, $t11.16b @ MODULO - fold into mid ++ ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ ++ eor3 $acc_lb, $acc_lb, $acc_mb, $acc_hb @ MODULO - fold into low ++ ext $acc_lb, $acc_lb, $acc_lb, #8 ++ rev64 $acc_lb, $acc_lb ++ st1 { $acc_l.16b }, [$current_tag] ++ rev32 $rtmp_ctr.16b, $rtmp_ctr.16b ++ ++ str $rtmp_ctrq, [$counter] @ store the updated counter ++ ++ lsr x0, $bit_length, #3 ++ ++ ldp d10, d11, [sp, #16] ++ ldp d12, d13, [sp, #32] ++ ldp d14, d15, [sp, #48] ++ ldp d8, d9, [sp], #80 ++ ret ++.L128_dec_ret: ++ mov w0, #0x0 ++ ret ++.size unroll8_eor3_aes_gcm_dec_128_kernel,.-unroll8_eor3_aes_gcm_dec_128_kernel ++___ ++} ++ ++{ ++my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7)); ++my ($temp2_x,$temp3_x)=map("x$_",(13..14)); ++my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15)); ++my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15)); ++my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7)); ++my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7)); ++my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15)); ++ ++my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15)); ++my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15)); ++my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15)); ++ ++my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19)); ++my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19)); ++ ++my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25)); ++my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25)); ++my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25)); ++my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25)); ++ ++my $t0="v16"; ++my $t0d="d16"; ++ ++my $t1="v29"; ++my $t2=$res1; ++my $t3=$t1; ++ ++my $t4=$res0; ++my $t5=$res2; ++my $t6=$t0; ++ ++my $t7=$res3; ++my $t8=$res4; ++my $t9=$res5; ++ ++my 
$t10=$res6; ++my $t11="v21"; ++my $t12=$t1; ++ ++my $rtmp_ctr="v30"; ++my $rtmp_ctrq="q30"; ++my $rctr_inc="v31"; ++my $rctr_incd="d31"; ++ ++my $mod_constantd=$t0d; ++my $mod_constant=$t0; ++ ++my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28)); ++my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28)); ++my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28)); ++my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28)); ++my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28)); ++my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28)); ++my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28)); ++my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28)); ++my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28)); ++my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28)); ++my $rk2q1="v28.1q"; ++my $rk3q1="v26.1q"; ++my $rk4v="v27"; ++ ++######################################################################################### ++# size_t unroll8_eor3_aes_gcm_enc_192_kernel(const unsigned char *in, ++# size_t len, ++# unsigned char *out, ++# const void *key, ++# unsigned char ivec[16], ++# u64 *Xi); ++# ++$code.=<<___; ++.global unroll8_eor3_aes_gcm_enc_192_kernel ++.type unroll8_eor3_aes_gcm_enc_192_kernel,%function ++.align 4 ++unroll8_eor3_aes_gcm_enc_192_kernel: ++ AARCH64_VALID_CALL_TARGET ++ cbz x1, .L192_enc_ret ++ stp d8, d9, [sp, #-80]! ++ mov $counter, x4 ++ mov $cc, x5 ++ stp d10, d11, [sp, #16] ++ stp d12, d13, [sp, #32] ++ stp d14, d15, [sp, #48] ++ mov x5, #0xc200000000000000 ++ stp x5, xzr, [sp, #64] ++ add $modulo_constant, sp, #64 ++ ++ lsr $main_end_input_ptr, $bit_length, #3 @ byte_len ++ ld1 { $ctr0b}, [$counter] @ CTR block 0 ++ ++ mov $constant_temp, #0x100000000 @ set up counter increment ++ movi $rctr_inc.16b, #0x0 ++ mov $rctr_inc.d[1], $constant_temp ++ ++ rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0 ++ ++ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1 ++ ++ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2 ++ ++ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3 ++ ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4 ++ sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 ++ ++ and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) ++ ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5 ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ ++ add $main_end_input_ptr, $main_end_input_ptr, $input_ptr ++ ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6 ++ ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7 ++ ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0 ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0 ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 ++ ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0 ++ ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0 ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 
5 - round 1 ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1 ++ ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1 ++ ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2 ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1 ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 ++ ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2 ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 ++ ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 ++ ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2 ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2 ++ ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3 ++ ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3 ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 ++ ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 ++ ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 ++ ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3 ++ ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3 ++ ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4 ++ ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4 ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4 ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4 ++ ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 ++ ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5 ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5 ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 ++ ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5 ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5 ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7 ++ ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6 ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6 ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 ++ ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6 ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 ++ ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6 ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ ++ aese $ctr6b, $rk7 \n aesmc 
$ctr6b, $ctr6b @ AES block 6 - round 7 ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 ++ ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7 ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 ++ ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7 ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 ++ ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7 ++ ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8 ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 ++ ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8 ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8 ++ ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8 ++ ++ add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr ++ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks ++ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9 ++ ++ ld1 { $acc_lb}, [$current_tag] ++ ext $acc_lb, $acc_lb, $acc_lb, #8 ++ rev64 $acc_lb, $acc_lb ++ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11 ++ ++ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9 ++ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9 ++ ++ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9 ++ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9 ++ ++ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9 ++ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9 ++ ++ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 14 - round 10 ++ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9 ++ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 11 - round 10 ++ ++ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 9 - round 10 ++ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 13 - round 10 ++ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 12 - round 10 ++ ++ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8 - round 10 ++ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 10 - round 10 ++ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 15 - round 10 ++ ++ aese $ctr6b, $rk11 @ AES block 14 - round 11 ++ aese $ctr3b, $rk11 @ AES block 11 - round 11 ++ ++ aese $ctr4b, $rk11 @ AES block 12 - round 11 ++ aese $ctr7b, $rk11 @ AES block 15 - round 11 ++ ldr $rk12q, [$cc, #192] @ load rk12 ++ ++ aese $ctr1b, $rk11 @ AES block 9 - round 11 ++ aese $ctr5b, $rk11 @ AES block 13 - round 11 ++ ++ aese $ctr2b, $rk11 @ AES block 10 - round 11 ++ aese $ctr0b, $rk11 @ AES block 8 - round 11 ++ b.ge .L192_enc_tail @ handle tail ++ ++ ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 0, 1 - load plaintext ++ ++ ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 2, 3 - load plaintext ++ ++ ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext ++ ++ ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext ++ ++ eor3 $res0b, $ctr_t0b, $ctr0b, $rk12 @ AES block 0 - result ++ rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8 ++ ++ eor3 $res3b, $ctr_t3b, $ctr3b, $rk12 @ 
AES block 3 - result ++ eor3 $res1b, $ctr_t1b, $ctr1b, $rk12 @ AES block 1 - result ++ ++ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9 ++ eor3 $res4b, $ctr_t4b, $ctr4b, $rk12 @ AES block 4 - result ++ ++ eor3 $res5b, $ctr_t5b, $ctr5b, $rk12 @ AES block 5 - result ++ eor3 $res7b, $ctr_t7b, $ctr7b, $rk12 @ AES block 7 - result ++ stp $res0q, $res1q, [$output_ptr], #32 @ AES block 0, 1 - store result ++ ++ eor3 $res2b, $ctr_t2b, $ctr2b, $rk12 @ AES block 2 - result ++ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10 ++ ++ stp $res2q, $res3q, [$output_ptr], #32 @ AES block 2, 3 - store result ++ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks ++ ++ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11 ++ eor3 $res6b, $ctr_t6b, $ctr6b, $rk12 @ AES block 6 - result ++ ++ stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result ++ ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12 ++ stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12 ++ ++ b.ge .L192_enc_prepretail @ do prepretail ++ ++.L192_enc_main_loop: @ main loop start ++ rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free) ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ rev64 $res2b, $res2b @ GHASH block 8k+2 ++ ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13 ++ ldr $h7q, [$current_tag, #176] @ load h7l | h7h ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ldr $h8q, [$current_tag, #208] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ ++ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 ++ rev64 $res0b, $res0b @ GHASH block 8k ++ ldr $h5q, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ldr $h6q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ ++ rev64 $res1b, $res1b @ GHASH block 8k+1 ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14 ++ ++ eor $res0b, $res0b, $acc_lb @ PRE 1 ++ rev64 $res3b, $res3b @ GHASH block 8k+3 ++ rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free) ++ ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0 ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15 ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0 ++ ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0 ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0 ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0 ++ ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0 ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0 ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0 ++ ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1 ++ ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1 ++ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high ++ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low ++ ++ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1 ++ ldr $h56kq, [$current_tag, 
#144] @ load h6k | h5k ++ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k ++ ++ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low ++ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1 ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1 ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1 ++ ++ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1 ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1 ++ ++ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high ++ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2 ++ ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2 ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2 ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2 ++ ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2 ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3 ++ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high ++ ++ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2 ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3 ++ ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2 ++ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2 ++ ++ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3 ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3 ++ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ ++ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid ++ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid ++ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low ++ ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3 ++ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ ++ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3 ++ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low ++ ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4 ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4 ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3 ++ ++ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4 ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3 ++ ++ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4 ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4 ++ ++ aese $ctr2b, $rk4 \n aesmc 
$ctr2b, $ctr2b @ AES block 8k+10 - round 4 ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4 ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4 ++ ++ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5 ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5 ++ rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free) ++ ++ rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free) ++ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high ++ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low ++ ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5 ++ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5 ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5 ++ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high ++ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5 ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5 ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5 ++ ++ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6 ++ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6 ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6 ++ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high ++ ++ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low ++ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6 ++ ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6 ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6 ++ ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6 ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7 ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6 ++ ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7 ++ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ ++ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid ++ ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7 ++ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7 ++ ++ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15 ++ ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high ++ aese $ctr0b, $rk7 \n 
aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7 ++ ++ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid ++ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7 ++ ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8 ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8 ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8 ++ ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8 ++ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7 ++ ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8 ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8 ++ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid ++ ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8 ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8 ++ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11 ++ ++ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low ++ rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16 ++ ++ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9 ++ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high ++ ++ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9 ++ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9 ++ ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load plaintext ++ ++ pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17 ++ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9 ++ ++ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9 ++ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9 ++ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9 ++ ++ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17 ++ ++ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10 ++ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10 ++ ldr $rk12q, [$cc, #192] @ load rk12 ++ ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ ++ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10 ++ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10 ++ ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load plaintext ++ ++ aese $ctr4b, $rk11 @ AES block 8k+12 - round 11 ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load plaintext ++ ++ ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load plaintext ++ aese $ctr2b, $rk11 @ AES block 8k+10 - round 11 ++ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10 ++ ++ rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18 ++ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10 ++ ++ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10 ++ pmull $acc_h.1q, 
$acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ ++ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10 ++ aese $ctr5b, $rk11 @ AES block 8k+13 - round 11 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18 ++ ++ aese $ctr7b, $rk11 @ AES block 8k+15 - round 11 ++ aese $ctr0b, $rk11 @ AES block 8k+8 - round 11 ++ eor3 $res4b, $ctr_t4b, $ctr4b, $rk12 @ AES block 4 - result ++ ++ aese $ctr6b, $rk11 @ AES block 8k+14 - round 11 ++ aese $ctr3b, $rk11 @ AES block 8k+11 - round 11 ++ aese $ctr1b, $rk11 @ AES block 8k+9 - round 11 ++ ++ rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19 ++ eor3 $res7b, $ctr_t7b, $ctr7b, $rk12 @ AES block 7 - result ++ ++ eor3 $res2b, $ctr_t2b, $ctr2b, $rk12 @ AES block 8k+10 - result ++ eor3 $res0b, $ctr_t0b, $ctr0b, $rk12 @ AES block 8k+8 - result ++ mov $ctr2.16b, $h3.16b @ CTR block 8k+18 ++ ++ eor3 $res1b, $ctr_t1b, $ctr1b, $rk12 @ AES block 8k+9 - result ++ mov $ctr1.16b, $h2.16b @ CTR block 8k+17 ++ stp $res0q, $res1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result ++ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ ++ eor3 $res6b, $ctr_t6b, $ctr6b, $rk12 @ AES block 6 - result ++ mov $ctr0.16b, $h1.16b @ CTR block 8k+16 ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20 ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20 ++ eor3 $res5b, $ctr_t5b, $ctr5b, $rk12 @ AES block 5 - result ++ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low ++ ++ eor3 $res3b, $ctr_t3b, $ctr3b, $rk12 @ AES block 8k+11 - result ++ mov $ctr3.16b, $h4.16b @ CTR block 8k+19 ++ ++ stp $res2q, $res3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result ++ ++ stp $res4q, $res5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result ++ ++ cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL ++ stp $res6q, $res7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result ++ b.lt .L192_enc_main_loop ++ ++.L192_enc_prepretail: @ PREPRETAIL ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13 ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13 ++ ++ ldr $h7q, [$current_tag, #176] @ load h7l | h7h ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ldr $h8q, [$current_tag, #208] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ rev64 $res0b, $res0b @ GHASH block 8k ++ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 ++ ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14 ++ ldr $h56kq, [$current_tag, #144] @ load h6k | h5k ++ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k ++ ++ rev64 $res3b, $res3b @ GHASH block 8k+3 ++ rev64 $res2b, $res2b @ GHASH block 8k+2 ++ ldr $h5q, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ldr $h6q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ ++ eor $res0b, $res0b, $acc_lb @ PRE 1 ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15 ++ rev64 $res1b, $res1b @ GHASH block 8k+1 ++ ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0 ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0 ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0 ++ ++ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0 ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0 ++ ++ 
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0 ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0 ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high ++ ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1 ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low ++ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ ++ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0 ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ ++ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low ++ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1 ++ ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1 ++ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1 ++ ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1 ++ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high ++ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high ++ ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1 ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1 ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1 ++ ++ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2 ++ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low ++ ++ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2 ++ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high ++ ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3 ++ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2 ++ ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2 ++ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid ++ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2 ++ rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free) ++ rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free) ++ ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2 ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2 ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2 ++ ++ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3 ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3 ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3 ++ ++ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid ++ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3 ++ ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3 ++ pmull2 
$t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid ++ ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3 ++ rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free) ++ ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3 ++ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4 ++ ++ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4 ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4 ++ ++ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4 ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4 ++ ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4 ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4 ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4 ++ ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5 ++ rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free) ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5 ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5 ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ ++ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high ++ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high ++ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low ++ ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5 ++ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ ++ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high ++ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low ++ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low ++ ++ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5 ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6 ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5 ++ ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5 ++ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5 ++ ++ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid ++ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid ++ ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6 ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6 ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7 ++ ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6 ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6 ++ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6 ++ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - 
high ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7 ++ ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6 ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6 ++ ++ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7 ++ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low ++ ++ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high ++ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid ++ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low ++ ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7 ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7 ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7 ++ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ ++ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low ++ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high ++ ++ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7 ++ pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8 ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8 ++ ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7 ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8 ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8 ++ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9 ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8 ++ ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8 ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8 ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8 ++ ++ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9 ++ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11 ++ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9 ++ ++ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9 ++ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9 ++ ++ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9 ++ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9 ++ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9 ++ ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ ldr $rk12q, [$cc, #192] @ load rk12 ++ ++ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10 ++ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10 ++ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10 ++ ++ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low ++ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10 ++ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10 ++ ++ aese $ctr1b, $rk11 @ AES 
block 8k+9 - round 11 ++ aese $ctr7b, $rk11 @ AES block 8k+15 - round 11 ++ ++ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10 ++ aese $ctr3b, $rk11 @ AES block 8k+11 - round 11 ++ ++ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10 ++ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10 ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15 ++ aese $ctr2b, $rk11 @ AES block 8k+10 - round 11 ++ aese $ctr0b, $rk11 @ AES block 8k+8 - round 11 ++ ++ aese $ctr6b, $rk11 @ AES block 8k+14 - round 11 ++ aese $ctr4b, $rk11 @ AES block 8k+12 - round 11 ++ aese $ctr5b, $rk11 @ AES block 8k+13 - round 11 ++ ++.L192_enc_tail: @ TAIL ++ ++ ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process ++ ++ ldr $ctr_t0q, [$input_ptr], #16 @ AES block 8k+8 - l3ad plaintext ++ ++ ldp $h78kq, $h8q, [$current_tag, #192] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ ++ mov $t1.16b, $rk12 ++ ++ ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ cmp $main_end_input_ptr, #112 ++ ++ eor3 $res1b, $ctr_t0b, $ctr0b, $t1.16b @ AES block 8k+8 - result ++ ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag ++ b.gt .L192_enc_blocks_more_than_7 ++ ++ cmp $main_end_input_ptr, #96 ++ mov $ctr7b, $ctr6b ++ movi $acc_h.8b, #0 ++ ++ mov $ctr6b, $ctr5b ++ movi $acc_l.8b, #0 ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ++ mov $ctr5b, $ctr4b ++ mov $ctr4b, $ctr3b ++ mov $ctr3b, $ctr2b ++ ++ mov $ctr2b, $ctr1b ++ movi $acc_m.8b, #0 ++ b.gt .L192_enc_blocks_more_than_6 ++ ++ mov $ctr7b, $ctr6b ++ cmp $main_end_input_ptr, #80 ++ ++ mov $ctr6b, $ctr5b ++ mov $ctr5b, $ctr4b ++ mov $ctr4b, $ctr3b ++ ++ mov $ctr3b, $ctr1b ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ b.gt .L192_enc_blocks_more_than_5 ++ ++ cmp $main_end_input_ptr, #64 ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ++ mov $ctr7b, $ctr6b ++ mov $ctr6b, $ctr5b ++ mov $ctr5b, $ctr4b ++ ++ mov $ctr4b, $ctr1b ++ b.gt .L192_enc_blocks_more_than_4 ++ ++ mov $ctr7b, $ctr6b ++ mov $ctr6b, $ctr5b ++ mov $ctr5b, $ctr1b ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ cmp $main_end_input_ptr, #48 ++ b.gt .L192_enc_blocks_more_than_3 ++ ++ mov $ctr7b, $ctr6b ++ mov $ctr6b, $ctr1b ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ cmp $main_end_input_ptr, #32 ++ b.gt .L192_enc_blocks_more_than_2 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ++ cmp $main_end_input_ptr, #16 ++ mov $ctr7b, $ctr1b ++ b.gt .L192_enc_blocks_more_than_1 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ b .L192_enc_blocks_less_than_1 ++.L192_enc_blocks_more_than_7: @ blocks left > 7 ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-7 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-7 block ++ ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid ++ ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-6 block - load plaintext ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low ++ ++ pmull2 
$acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high ++ ++ pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid ++ eor3 $res1b, $ctr_t1b, $ctr1b, $t1.16b @ AES final-6 block - result ++.L192_enc_blocks_more_than_6: @ blocks left > 6 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-6 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-6 block ++ ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-5 block - load plaintext ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid ++ ++ pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low ++ eor3 $res1b, $ctr_t1b, $ctr2b, $t1.16b @ AES final-5 block - result ++ ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid ++ ++ pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid ++.L192_enc_blocks_more_than_5: @ blocks left > 5 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-5 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-5 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid ++ ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-4 block - load plaintext ++ pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid ++ pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low ++ pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid ++ ++ eor3 $res1b, $ctr_t1b, $ctr3b, $t1.16b @ AES final-4 block - result ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid ++.L192_enc_blocks_more_than_4: @ blocks left > 4 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-4 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-4 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-3 block - load plaintext ++ pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid ++ ++ pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid ++ ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low ++ ++ pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid ++ eor3 $res1b, $ctr_t1b, $ctr4b, $t1.16b @ AES final-3 block - result ++.L192_enc_blocks_more_than_3: @ blocks left > 3 ++ ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-3 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-2 block - load plaintext ++ ldr $h4q, [$current_tag, #112] @ load h4l | 
h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid ++ ++ eor3 $res1b, $ctr_t1b, $ctr5b, $t1.16b @ AES final-2 block - result ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid ++ pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low ++ ++ pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high ++ pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high ++.L192_enc_blocks_more_than_2: @ blocks left > 2 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-2 block ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-1 block - load plaintext ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid ++ ++ pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low ++ pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid ++ eor3 $res1b, $ctr_t1b, $ctr6b, $t1.16b @ AES final-1 block - result ++.L192_enc_blocks_more_than_1: @ blocks left > 1 ++ ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-1 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid ++ pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low ++ pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid ++ ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final block - load plaintext ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid ++ ++ eor3 $res1b, $ctr_t1b, $ctr7b, $t1.16b @ AES final block - result ++ pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid ++ ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high ++.L192_enc_blocks_less_than_1: @ blocks left <= 1 ++ ++ mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff ++ and $bit_length, $bit_length, #127 @ bit_length %= 128 ++ ++ sub $bit_length, $bit_length, #128 @ bit_length -= 128 ++ ++ neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) ++ ++ and $bit_length, $bit_length, #127 @ bit_length %= 128 ++ ++ lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block ++ cmp $bit_length, #64 ++ mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff ++ ++ csel $temp2_x, $temp1_x, $temp0_x, lt ++ csel $temp3_x, $temp0_x, xzr, lt ++ ++ mov $ctr0.d[1], $temp3_x ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ 
ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ++ ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored ++ mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block ++ ++ and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits ++ ++ rev64 $res0b, $res1b @ GHASH final block ++ bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing ++ ++ st1 { $res1b}, [$output_ptr] @ store all 16B ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $t0.d[0], $res0.d[1] @ GHASH final block - mid ++ pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high ++ pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low ++ ++ eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid ++ ++ pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid ++ ++ eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low ++ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ ++ rev32 $rtmp_ctr.16b, $rtmp_ctr.16b ++ ++ str $rtmp_ctrq, [$counter] @ store the updated counter ++ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ ++ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ ++ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low ++ ext $acc_lb, $acc_lb, $acc_lb, #8 ++ rev64 $acc_lb, $acc_lb ++ st1 { $acc_l.16b }, [$current_tag] ++ ++ lsr x0, $bit_length, #3 @ return sizes ++ ++ ldp d10, d11, [sp, #16] ++ ldp d12, d13, [sp, #32] ++ ldp d14, d15, [sp, #48] ++ ldp d8, d9, [sp], #80 ++ ret ++ ++.L192_enc_ret: ++ mov w0, #0x0 ++ ret ++.size unroll8_eor3_aes_gcm_enc_192_kernel,.-unroll8_eor3_aes_gcm_enc_192_kernel ++___ ++ ++######################################################################################### ++# size_t unroll8_eor3_aes_gcm_dec_192_kernel(const unsigned char *in, ++# size_t len, ++# unsigned char *out, ++# const void *key, ++# unsigned char ivec[16], ++# u64 *Xi); ++# ++$code.=<<___; ++.global unroll8_eor3_aes_gcm_dec_192_kernel ++.type unroll8_eor3_aes_gcm_dec_192_kernel,%function ++.align 4 ++unroll8_eor3_aes_gcm_dec_192_kernel: ++ AARCH64_VALID_CALL_TARGET ++ cbz x1, .L192_dec_ret ++ stp d8, d9, [sp, #-80]! 
++ mov $counter, x4 ++ mov $cc, x5 ++ stp d10, d11, [sp, #16] ++ stp d12, d13, [sp, #32] ++ stp d14, d15, [sp, #48] ++ mov x5, #0xc200000000000000 ++ stp x5, xzr, [sp, #64] ++ add $modulo_constant, sp, #64 ++ ++ lsr $main_end_input_ptr, $bit_length, #3 @ byte_len ++ ld1 { $ctr0b}, [$counter] @ CTR block 0 ++ ld1 { $acc_lb}, [$current_tag] ++ ++ mov $constant_temp, #0x100000000 @ set up counter increment ++ movi $rctr_inc.16b, #0x0 ++ mov $rctr_inc.d[1], $constant_temp ++ ++ rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0 ++ ++ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1 ++ ++ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2 ++ ++ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3 ++ ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4 ++ ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5 ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6 ++ ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7 ++ ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0 ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0 ++ ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0 ++ ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0 ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 ++ ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 ++ ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1 ++ ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1 ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1 ++ ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2 ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1 ++ ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2 ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 ++ ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2 ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2 ++ ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3 ++ ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3 ++ ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 ++ ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, 
$ctr4b @ AES block 4 - round 3 ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3 ++ ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4 ++ ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4 ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4 ++ ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5 ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4 ++ ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5 ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5 ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 ++ ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5 ++ ++ sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 ++ ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6 ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6 ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 ++ ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6 ++ ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6 ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7 ++ ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7 ++ ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7 ++ ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7 ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7 ++ ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 ++ and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) ++ ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8 ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8 ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8 ++ ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8 ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 ++ ++ add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr ++ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9 ++ ++ ld1 { $acc_lb}, [$current_tag] ++ ext 
$acc_lb, $acc_lb, $acc_lb, #8 ++ rev64 $acc_lb, $acc_lb ++ ++ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11 ++ ++ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9 ++ add $main_end_input_ptr, $main_end_input_ptr, $input_ptr ++ ++ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9 ++ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9 ++ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9 ++ ++ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks ++ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9 ++ ++ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9 ++ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9 ++ ++ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10 ++ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10 ++ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 10 ++ ++ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 10 ++ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10 ++ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10 ++ ++ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 10 ++ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 10 ++ ldr $rk12q, [$cc, #192] @ load rk12 ++ ++ aese $ctr0b, $rk11 @ AES block 0 - round 11 ++ aese $ctr1b, $rk11 @ AES block 1 - round 11 ++ aese $ctr4b, $rk11 @ AES block 4 - round 11 ++ ++ aese $ctr6b, $rk11 @ AES block 6 - round 11 ++ aese $ctr5b, $rk11 @ AES block 5 - round 11 ++ aese $ctr7b, $rk11 @ AES block 7 - round 11 ++ ++ aese $ctr2b, $rk11 @ AES block 2 - round 11 ++ aese $ctr3b, $rk11 @ AES block 3 - round 11 ++ b.ge .L192_dec_tail @ handle tail ++ ++ ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 0, 1 - load ciphertext ++ ++ ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 2, 3 - load ciphertext ++ ++ ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 4, 5 - load ciphertext ++ ++ eor3 $ctr1b, $res1b, $ctr1b, $rk12 @ AES block 1 - result ++ eor3 $ctr0b, $res0b, $ctr0b, $rk12 @ AES block 0 - result ++ stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 0, 1 - store result ++ ++ rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8 ++ ++ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9 ++ eor3 $ctr3b, $res3b, $ctr3b, $rk12 @ AES block 3 - result ++ ++ eor3 $ctr2b, $res2b, $ctr2b, $rk12 @ AES block 2 - result ++ stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 2, 3 - store result ++ ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 6, 7 - load ciphertext ++ ++ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10 ++ ++ eor3 $ctr4b, $res4b, $ctr4b, $rk12 @ AES block 4 - result ++ ++ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11 ++ ++ eor3 $ctr5b, $res5b, $ctr5b, $rk12 @ AES block 5 - result ++ stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 4, 5 - store result ++ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks ++ ++ eor3 $ctr6b, $res6b, $ctr6b, $rk12 @ AES block 6 - result ++ eor3 $ctr7b, $res7b, $ctr7b, $rk12 @ AES block 7 - result ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12 ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12 ++ stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 6, 7 - store result ++ b.ge 
.L192_dec_prepretail @ do prepretail ++ ++.L192_dec_main_loop: @ main loop start ++ rev64 $res1b, $res1b @ GHASH block 8k+1 ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 ++ ++ rev64 $res0b, $res0b @ GHASH block 8k ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13 ++ ++ ldr $h7q, [$current_tag, #176] @ load h7l | h7h ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ldr $h8q, [$current_tag, #208] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ rev64 $res4b, $res4b @ GHASH block 8k+4 ++ rev64 $res3b, $res3b @ GHASH block 8k+3 ++ ++ eor $res0b, $res0b, $acc_lb @ PRE 1 ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14 ++ ++ rev64 $res5b, $res5b @ GHASH block 8k+5 ++ ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15 ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0 ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0 ++ ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0 ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0 ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0 ++ ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0 ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0 ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0 ++ ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low ++ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1 ++ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low ++ ldr $h5q, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ldr $h6q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1 ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1 ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1 ++ ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1 ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1 ++ ++ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ rev64 $res2b, $res2b @ GHASH block 8k+2 ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1 ++ ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1 ++ ldr $h56kq, [$current_tag, #144] @ load h6k | h5k ++ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k ++ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ ++ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high ++ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high ++ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high ++ ++ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid ++ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2 ++ ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2 ++ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low ++ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high ++ ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2 ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, 
$ctr6b @ AES block 8k+14 - round 3 ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2 ++ ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2 ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2 ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2 ++ ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2 ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3 ++ ++ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low ++ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3 ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3 ++ ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3 ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3 ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ ++ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3 ++ ++ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15 ++ ++ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid ++ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid ++ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid ++ ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3 ++ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid ++ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high ++ ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4 ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4 ++ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid ++ ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4 ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4 ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4 ++ ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4 ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4 ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4 ++ ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5 ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5 ++ ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5 ++ rev64 $res7b, $res7b @ GHASH block 8k+7 ++ ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5 ++ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5 ++ ++ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low ++ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5 ++ ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ 
AES block 8k+14 - round 5 ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5 ++ rev64 $res6b, $res6b @ GHASH block 8k+6 ++ ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high ++ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low ++ ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6 ++ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6 ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6 ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6 ++ ++ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6 ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6 ++ ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7 ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7 ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6 ++ ++ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid ++ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high ++ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low ++ ++ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low ++ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6 ++ ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7 ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7 ++ ++ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7 ++ ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7 ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7 ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7 ++ ++ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid ++ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high ++ ++ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low ++ ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8 ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8 ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8 ++ ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8 ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8 ++ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low ++ ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8 ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8 ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8 ++ ++ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high ++ rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16 ++ ++ aese $ctr5b, 
$rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9 ++ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9 ++ ++ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9 ++ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9 ++ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11 ++ ++ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load ciphertext ++ ++ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9 ++ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9 ++ ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load ciphertext ++ ++ rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17 ++ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17 ++ ++ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9 ++ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9 ++ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ ++ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10 ++ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10 ++ ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load ciphertext ++ ++ rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18 ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ ++ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10 ++ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10 ++ ldr $rk12q, [$cc, #192] @ load rk12 ++ ++ ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load ciphertext ++ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10 ++ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10 ++ ++ aese $ctr0b, $rk11 @ AES block 8k+8 - round 11 ++ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ aese $ctr1b, $rk11 @ AES block 8k+9 - round 11 ++ ++ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10 ++ aese $ctr6b, $rk11 @ AES block 8k+14 - round 11 ++ aese $ctr3b, $rk11 @ AES block 8k+11 - round 11 ++ ++ eor3 $ctr0b, $res0b, $ctr0b, $rk12 @ AES block 8k+8 - result ++ rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19 ++ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10 ++ ++ aese $ctr4b, $rk11 @ AES block 8k+12 - round 11 ++ aese $ctr2b, $rk11 @ AES block 8k+10 - round 11 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19 ++ ++ aese $ctr7b, $rk11 @ AES block 8k+15 - round 11 ++ aese $ctr5b, $rk11 @ AES block 8k+13 - round 11 ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ ++ eor3 $ctr1b, $res1b, $ctr1b, $rk12 @ AES block 8k+9 - result ++ stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result ++ eor3 $ctr3b, $res3b, $ctr3b, $rk12 @ AES block 8k+11 - result ++ ++ eor3 $ctr2b, $res2b, $ctr2b, $rk12 @ AES block 8k+10 - result ++ eor3 $ctr7b, $res7b, $ctr7b, $rk12 @ AES block 8k+15 - result ++ stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result ++ ++ eor3 $ctr5b, $res5b, $ctr5b, $rk12 @ AES block 8k+13 - result ++ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low ++ mov $ctr3.16b, 
$h4.16b @ CTR block 8k+19 ++ ++ eor3 $ctr4b, $res4b, $ctr4b, $rk12 @ AES block 8k+12 - result ++ stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result ++ cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL ++ ++ eor3 $ctr6b, $res6b, $ctr6b, $rk12 @ AES block 8k+14 - result ++ stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result ++ mov $ctr0.16b, $h1.16b @ CTR block 8k+16 ++ ++ mov $ctr1.16b, $h2.16b @ CTR block 8k+17 ++ mov $ctr2.16b, $h3.16b @ CTR block 8k+18 ++ ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20 ++ b.lt .L192_dec_main_loop ++ ++.L192_dec_prepretail: @ PREPRETAIL ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13 ++ ++ ldr $h7q, [$current_tag, #176] @ load h7l | h7h ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ldr $h8q, [$current_tag, #208] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ rev64 $res0b, $res0b @ GHASH block 8k ++ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 ++ ++ rev64 $res3b, $res3b @ GHASH block 8k+3 ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14 ++ ++ eor $res0b, $res0b, $acc_lb @ PRE 1 ++ rev64 $res2b, $res2b @ GHASH block 8k+2 ++ rev64 $res1b, $res1b @ GHASH block 8k+1 ++ ++ ldr $h5q, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ldr $h6q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15 ++ ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0 ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0 ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0 ++ ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0 ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0 ++ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high ++ ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0 ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0 ++ ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1 ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0 ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1 ++ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high ++ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low ++ ++ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low ++ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1 ++ ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1 ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1 ++ ++ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high ++ ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1 ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1 ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1 ++ ++ ldr $h56kq, [$current_tag, #144] @ 
load h6k | h5k ++ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2 ++ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid ++ ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2 ++ rev64 $res5b, $res5b @ GHASH block 8k+5 ++ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low ++ ++ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2 ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2 ++ ++ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3 ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2 ++ ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2 ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2 ++ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ ++ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2 ++ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid ++ ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3 ++ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low ++ ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3 ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3 ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3 ++ ++ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3 ++ ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid ++ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid ++ ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3 ++ ++ rev64 $res7b, $res7b @ GHASH block 8k+7 ++ ++ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ rev64 $res4b, $res4b @ GHASH block 8k+4 ++ ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4 ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4 ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3 ++ ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4 ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4 ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4 ++ ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4 ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4 ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4 ++ ++ rev64 $res6b, $res6b @ GHASH block 8k+6 ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ ++ aese $ctr7b, $rk5 \n 
aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5 ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5 ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5 ++ ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5 ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5 ++ ++ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high ++ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high ++ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low ++ ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5 ++ ++ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low ++ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high ++ ++ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low ++ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5 ++ ++ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5 ++ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6 ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6 ++ ++ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6 ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6 ++ ++ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid ++ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6 ++ ++ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6 ++ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high ++ ++ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7 ++ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low ++ ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6 ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6 ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7 ++ ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7 ++ ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high ++ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low ++ ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7 ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7 ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7 ++ ++ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high ++ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low ++ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7 ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7 ++ ++ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ ext $t11.16b, 
$acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8 ++ ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8 ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8 ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8 ++ ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8 ++ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8 ++ ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8 ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8 ++ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11 ++ ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9 ++ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9 ++ ++ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9 ++ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9 ++ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9 ++ ++ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9 ++ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9 ++ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9 ++ ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ ldr $rk12q, [$cc, #192] @ load rk12 ++ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ ++ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10 ++ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10 ++ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10 ++ ++ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10 ++ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10 ++ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10 ++ ++ aese $ctr0b, $rk11 @ AES block 8k+8 - round 11 ++ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low ++ aese $ctr5b, $rk11 @ AES block 8k+13 - round 11 ++ ++ aese $ctr2b, $rk11 @ AES block 8k+10 - round 11 ++ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10 ++ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10 ++ ++ aese $ctr6b, $rk11 @ AES block 8k+14 - round 11 ++ aese $ctr4b, $rk11 @ AES block 8k+12 - round 11 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15 ++ ++ aese $ctr3b, $rk11 @ AES block 8k+11 - round 11 ++ aese $ctr1b, $rk11 @ AES block 8k+9 - round 11 ++ aese $ctr7b, $rk11 @ AES block 8k+15 - round 11 ++ ++.L192_dec_tail: @ TAIL ++ ++ sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process ++ ++ ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ldr $res1q, [$input_ptr], #16 @ AES block 8k+8 - load ciphertext ++ ++ ldp $h78kq, $h8q, [$current_tag, #192] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ ++ mov $t1.16b, $rk12 ++ ++ ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag ++ ++ eor3 $res4b, $res1b, $ctr0b, $t1.16b @ AES block 8k+8 - result ++ cmp $main_end_input_ptr, #112 ++ b.gt 
.L192_dec_blocks_more_than_7 ++ ++ mov $ctr7b, $ctr6b ++ movi $acc_h.8b, #0 ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ++ mov $ctr6b, $ctr5b ++ mov $ctr5b, $ctr4b ++ mov $ctr4b, $ctr3b ++ ++ cmp $main_end_input_ptr, #96 ++ movi $acc_l.8b, #0 ++ mov $ctr3b, $ctr2b ++ ++ mov $ctr2b, $ctr1b ++ movi $acc_m.8b, #0 ++ b.gt .L192_dec_blocks_more_than_6 ++ ++ mov $ctr7b, $ctr6b ++ mov $ctr6b, $ctr5b ++ mov $ctr5b, $ctr4b ++ ++ mov $ctr4b, $ctr3b ++ mov $ctr3b, $ctr1b ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ cmp $main_end_input_ptr, #80 ++ b.gt .L192_dec_blocks_more_than_5 ++ ++ mov $ctr7b, $ctr6b ++ mov $ctr6b, $ctr5b ++ ++ mov $ctr5b, $ctr4b ++ mov $ctr4b, $ctr1b ++ cmp $main_end_input_ptr, #64 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ b.gt .L192_dec_blocks_more_than_4 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ mov $ctr7b, $ctr6b ++ mov $ctr6b, $ctr5b ++ ++ mov $ctr5b, $ctr1b ++ cmp $main_end_input_ptr, #48 ++ b.gt .L192_dec_blocks_more_than_3 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ mov $ctr7b, $ctr6b ++ cmp $main_end_input_ptr, #32 ++ ++ mov $ctr6b, $ctr1b ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ b.gt .L192_dec_blocks_more_than_2 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ++ mov $ctr7b, $ctr1b ++ cmp $main_end_input_ptr, #16 ++ b.gt .L192_dec_blocks_more_than_1 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ b .L192_dec_blocks_less_than_1 ++.L192_dec_blocks_more_than_7: @ blocks left > 7 ++ rev64 $res0b, $res1b @ GHASH final-7 block ++ ++ ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid ++ ldr $res1q, [$input_ptr], #16 @ AES final-6 block - load ciphertext ++ ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-7 block - store result ++ ++ eor3 $res4b, $res1b, $ctr1b, $t1.16b @ AES final-6 block - result ++ ++ pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++.L192_dec_blocks_more_than_6: @ blocks left > 6 ++ ++ rev64 $res0b, $res1b @ GHASH final-6 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ldr $res1q, [$input_ptr], #16 @ AES final-5 block - load ciphertext ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high ++ ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-6 block - store result ++ eor3 $res4b, $res1b, $ctr2b, $t1.16b @ AES final-5 block - result ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high ++ pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid ++ pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low ++.L192_dec_blocks_more_than_5: @ blocks left > 5 ++ ++ rev64 $res0b, $res1b @ GHASH final-5 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid ++ ++ ins $rk4v.d[1], 
$rk4v.d[0] @ GHASH final-5 block - mid ++ pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high ++ ++ ldr $res1q, [$input_ptr], #16 @ AES final-4 block - load ciphertext ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high ++ pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low ++ ++ pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-5 block - store result ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid ++ eor3 $res4b, $res1b, $ctr3b, $t1.16b @ AES final-4 block - result ++.L192_dec_blocks_more_than_4: @ blocks left > 4 ++ ++ rev64 $res0b, $res1b @ GHASH final-4 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ ldr $res1q, [$input_ptr], #16 @ AES final-3 block - load ciphertext ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid ++ pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low ++ ++ pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-4 block - store result ++ pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high ++ ++ eor3 $res4b, $res1b, $ctr4b, $t1.16b @ AES final-3 block - result ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high ++.L192_dec_blocks_more_than_3: @ blocks left > 3 ++ ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ rev64 $res0b, $res1b @ GHASH final-3 block ++ ldr $res1q, [$input_ptr], #16 @ AES final-2 block - load ciphertext ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid ++ pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low ++ ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-3 block - store result ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid ++ eor3 $res4b, $res1b, $ctr5b, $t1.16b @ AES final-2 block - result ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid ++ ++ pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid ++.L192_dec_blocks_more_than_2: @ blocks left > 2 ++ ++ rev64 $res0b, $res1b @ GHASH final-2 block ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid ++ ldr $res1q, [$input_ptr], #16 @ AES final-1 block - load ciphertext ++ ++ pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high ++ pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low ++ ++ pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ eor $acc_lb, 
$acc_lb, $rk3 @ GHASH final-2 block - low ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-2 block - store result ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid ++ eor3 $res4b, $res1b, $ctr6b, $t1.16b @ AES final-1 block - result ++.L192_dec_blocks_more_than_1: @ blocks left > 1 ++ ++ rev64 $res0b, $res1b @ GHASH final-1 block ++ ldr $res1q, [$input_ptr], #16 @ AES final block - load ciphertext ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ++ pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-1 block - store result ++ ++ pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high ++ ++ eor3 $res4b, $res1b, $ctr7b, $t1.16b @ AES final block - result ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid ++ ++ pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high ++.L192_dec_blocks_less_than_1: @ blocks left <= 1 ++ ++ rev32 $rtmp_ctr.16b, $rtmp_ctr.16b ++ and $bit_length, $bit_length, #127 @ bit_length %= 128 ++ ++ sub $bit_length, $bit_length, #128 @ bit_length -= 128 ++ str $rtmp_ctrq, [$counter] @ store the updated counter ++ ++ neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) ++ mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff ++ ++ and $bit_length, $bit_length, #127 @ bit_length %= 128 ++ ++ mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff ++ lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block ++ cmp $bit_length, #64 ++ ++ csel $temp2_x, $temp1_x, $temp0_x, lt ++ csel $temp3_x, $temp0_x, xzr, lt ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ++ mov $ctr0.d[1], $temp3_x ++ ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored ++ ++ mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block ++ ++ and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits ++ bif $res4b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing ++ ++ rev64 $res0b, $res1b @ GHASH final block ++ ++ st1 { $res4b}, [$output_ptr] @ store all 16B ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $t0.d[0], $res0.d[1] @ GHASH final block - mid ++ pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low ++ ++ eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid ++ pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low ++ ++ pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high ++ ++ eor $t10.16b, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ ++ pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ ++ eor $acc_mb, $acc_mb, $t10.16b @ MODULO - karatsuba tidy up ++ 
++ eor3 $acc_mb, $acc_mb, $acc_hb, $t11.16b @ MODULO - fold into mid ++ ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ ++ eor3 $acc_lb, $acc_lb, $acc_mb, $acc_hb @ MODULO - fold into low ++ ext $acc_lb, $acc_lb, $acc_lb, #8 ++ rev64 $acc_lb, $acc_lb ++ st1 { $acc_l.16b }, [$current_tag] ++ ++ ldp d10, d11, [sp, #16] ++ ldp d12, d13, [sp, #32] ++ ldp d14, d15, [sp, #48] ++ ldp d8, d9, [sp], #80 ++ ret ++ ++.L192_dec_ret: ++ mov w0, #0x0 ++ ret ++.size unroll8_eor3_aes_gcm_dec_192_kernel,.-unroll8_eor3_aes_gcm_dec_192_kernel ++___ ++} ++ ++{ ++ ++my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7)); ++my ($temp2_x,$temp3_x)=map("x$_",(13..14)); ++my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15)); ++my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15)); ++my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7)); ++my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7)); ++my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15)); ++ ++my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15)); ++my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15)); ++my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15)); ++ ++my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19)); ++my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19)); ++ ++my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25)); ++my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25)); ++my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25)); ++my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25)); ++ ++my $t0="v16"; ++my $t0d="d16"; ++ ++my $t1="v29"; ++my $t2=$res1; ++my $t3=$t1; ++ ++my $t4=$res0; ++my $t5=$res2; ++my $t6=$t0; ++ ++my $t7=$res3; ++my $t8=$res4; ++my $t9=$res5; ++ ++my $t10=$res6; ++my $t11="v21"; ++my $t12=$t1; ++ ++my $rtmp_ctr="v30"; ++my $rtmp_ctrq="q30"; ++my $rctr_inc="v31"; ++my $rctr_incd="d31"; ++ ++my $mod_constantd=$t0d; ++my $mod_constant=$t0; ++ ++my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28)); ++my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28)); ++my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28)); ++my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28)); ++my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28)); ++my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28)); ++my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28)); ++my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28)); ++my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28)); ++my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28)); ++my $rk2q1="v28.1q"; ++my $rk3q1="v26.1q"; ++my $rk4v="v27"; ++######################################################################################### ++# size_t unroll8_eor3_aes_gcm_enc_256_kernel(const unsigned char *in, ++# size_t len, ++# unsigned char *out, ++# const void *key, ++# unsigned char ivec[16], ++# u64 *Xi); ++# ++$code.=<<___; ++.global unroll8_eor3_aes_gcm_enc_256_kernel ++.type unroll8_eor3_aes_gcm_enc_256_kernel,%function ++.align 4 ++unroll8_eor3_aes_gcm_enc_256_kernel: ++ AARCH64_VALID_CALL_TARGET ++ cbz x1, .L256_enc_ret ++ stp d8, d9, [sp, #-80]! 
++ mov $counter, x4 ++ mov $cc, x5 ++ stp d10, d11, [sp, #16] ++ stp d12, d13, [sp, #32] ++ stp d14, d15, [sp, #48] ++ mov x5, #0xc200000000000000 ++ stp x5, xzr, [sp, #64] ++ add $modulo_constant, sp, #64 ++ ++ ld1 { $ctr0b}, [$counter] @ CTR block 0 ++ ++ lsr $main_end_input_ptr, $bit_length, #3 @ byte_len ++ ++ mov $constant_temp, #0x100000000 @ set up counter increment ++ movi $rctr_inc.16b, #0x0 ++ mov $rctr_inc.d[1], $constant_temp ++ sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 ++ ++ and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) ++ ++ add $main_end_input_ptr, $main_end_input_ptr, $input_ptr ++ ++ rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0 ++ ++ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1 ++ ++ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2 ++ ++ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3 ++ ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4 ++ ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5 ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6 ++ ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7 ++ ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0 ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 ++ ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0 ++ ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0 ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0 ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1 ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 ++ ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1 ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1 ++ ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 ++ ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1 ++ ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 ++ ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2 ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2 ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2 ++ ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2 ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 ++ ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3 ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 ++ ldp $rk4q, $rk5q, [$cc, #64] @ load 
rk4, rk5 ++ ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3 ++ ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3 ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3 ++ ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 ++ ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4 ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4 ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 ++ ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 ++ ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4 ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4 ++ ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5 ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5 ++ ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5 ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5 ++ ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6 ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6 ++ ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6 ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 ++ ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6 ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 ++ ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7 ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7 ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 ++ ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7 ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 ++ ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7 ++ ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8 ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 ++ ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8 ++ ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8 ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8 ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 ++ ++ ld1 { $acc_lb}, [$current_tag] ++ ext $acc_lb, $acc_lb, $acc_lb, #8 ++ rev64 $acc_lb, $acc_lb ++ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11 ++ ++ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - 
round 9 ++ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9 ++ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9 ++ ++ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9 ++ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9 ++ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9 ++ ++ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9 ++ ++ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 10 ++ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 10 ++ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9 ++ ++ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10 ++ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 10 ++ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10 ++ ++ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10 ++ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10 ++ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 10 ++ ++ aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 11 ++ ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13 ++ aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 11 ++ ++ aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11 ++ aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 11 ++ aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11 ++ ++ aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11 ++ aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11 ++ aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 11 ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7 ++ ldr $rk14q, [$cc, #224] @ load rk14 ++ ++ aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 12 ++ aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12 ++ aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12 ++ ++ aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12 ++ aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 12 ++ aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12 ++ ++ aese $ctr2b, $rk13 @ AES block 2 - round 13 ++ aese $ctr1b, $rk13 @ AES block 1 - round 13 ++ aese $ctr4b, $rk13 @ AES block 4 - round 13 ++ ++ aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 12 ++ aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 12 ++ ++ aese $ctr0b, $rk13 @ AES block 0 - round 13 ++ aese $ctr5b, $rk13 @ AES block 5 - round 13 ++ ++ aese $ctr6b, $rk13 @ AES block 6 - round 13 ++ aese $ctr7b, $rk13 @ AES block 7 - round 13 ++ aese $ctr3b, $rk13 @ AES block 3 - round 13 ++ ++ add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr ++ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks ++ b.ge .L256_enc_tail @ handle tail ++ ++ ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 0, 1 - load plaintext ++ ++ ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 2, 3 - load plaintext ++ ++ eor3 $res0b, $ctr_t0b, $ctr0b, $rk14 @ AES block 0 - result ++ rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8 ++ ++ eor3 $res1b, $ctr_t1b, $ctr1b, $rk14 @ AES block 1 - result ++ eor3 $res3b, $ctr_t3b, $ctr3b, $rk14 @ AES block 3 - result ++ ++ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9 ++ ldp $ctr_t4q, 
$ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext ++ ++ ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext ++ eor3 $res2b, $ctr_t2b, $ctr2b, $rk14 @ AES block 2 - result ++ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks ++ ++ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10 ++ stp $res0q, $res1q, [$output_ptr], #32 @ AES block 0, 1 - store result ++ ++ stp $res2q, $res3q, [$output_ptr], #32 @ AES block 2, 3 - store result ++ ++ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11 ++ ++ eor3 $res4b, $ctr_t4b, $ctr4b, $rk14 @ AES block 4 - result ++ ++ eor3 $res7b, $ctr_t7b, $ctr7b, $rk14 @ AES block 7 - result ++ eor3 $res6b, $ctr_t6b, $ctr6b, $rk14 @ AES block 6 - result ++ eor3 $res5b, $ctr_t5b, $ctr5b, $rk14 @ AES block 5 - result ++ ++ stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12 ++ ++ stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12 ++ b.ge .L256_enc_prepretail @ do prepretail ++ ++.L256_enc_main_loop: @ main loop start ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13 ++ ldr $h56kq, [$current_tag, #144] @ load h6k | h5k ++ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k ++ ++ rev64 $res3b, $res3b @ GHASH block 8k+3 ++ ldr $h5q, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ldr $h6q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ rev64 $res1b, $res1b @ GHASH block 8k+1 ++ ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14 ++ rev64 $res0b, $res0b @ GHASH block 8k ++ ++ rev64 $res4b, $res4b @ GHASH block 8k+4 ++ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 ++ ldr $h7q, [$current_tag, #176] @ load h7l | h7h ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ldr $h8q, [$current_tag, #208] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0 ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0 ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15 ++ ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0 ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0 ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0 ++ ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0 ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0 ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0 ++ ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ eor $res0b, $res0b, $acc_lb @ PRE 1 ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1 ++ ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1 ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1 ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1 ++ ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1 ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1 ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1 ++ ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high ++ 
pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low ++ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high ++ ++ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1 ++ ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2 ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2 ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2 ++ ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2 ++ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2 ++ ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3 ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3 ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2 ++ ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3 ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2 ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2 ++ ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3 ++ rev64 $res6b, $res6b @ GHASH block 8k+6 ++ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high ++ ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3 ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ rev64 $res2b, $res2b @ GHASH block 8k+2 ++ ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3 ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3 ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3 ++ ++ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high ++ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high ++ rev64 $res5b, $res5b @ GHASH block 8k+5 ++ ++ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low ++ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ ++ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high ++ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low ++ ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4 ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4 ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4 ++ ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4 ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4 ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4 ++ ++ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4 ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4 ++ ++ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5 ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5 ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5 ++ ++ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 
8k+2, 8k+3 - mid ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5 ++ rev64 $res7b, $res7b @ GHASH block 8k+7 ++ ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5 ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5 ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5 ++ ++ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid ++ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5 ++ ++ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6 ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6 ++ ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6 ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6 ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6 ++ ++ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid ++ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6 ++ ++ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6 ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6 ++ ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7 ++ ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7 ++ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7 ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7 ++ ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7 ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7 ++ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low ++ ++ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7 ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7 ++ ++ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8 ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8 ++ ++ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low ++ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8 ++ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9 ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8 ++ ++ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid ++ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8 ++ ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8 ++ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - 
high ++ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low ++ ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8 ++ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8 ++ ++ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9 ++ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9 ++ ++ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9 ++ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9 ++ ++ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11 ++ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9 ++ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9 ++ ++ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high ++ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low ++ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low ++ ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid ++ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid ++ ++ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9 ++ ++ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low ++ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high ++ ++ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10 ++ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10 ++ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10 ++ ++ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10 ++ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15 ++ ++ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10 ++ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10 ++ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10 ++ ++ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high ++ ++ ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13 ++ rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16 ++ ++ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load plaintext ++ aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11 ++ ++ aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16 ++ aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11 ++ ++ aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11 ++ aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11 ++ ++ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11 ++ ++ aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12 ++ aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11 ++ ++ aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12 ++ aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12 ++ 
rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17 ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17 ++ aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11 ++ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ ++ aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12 ++ ldr $rk14q, [$cc, #224] @ load rk14 ++ aese $ctr7b, $rk13 @ AES block 8k+15 - round 13 ++ ++ ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load plaintext ++ aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12 ++ aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12 ++ ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12 ++ ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext ++ ++ ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext ++ aese $ctr2b, $rk13 @ AES block 8k+10 - round 13 ++ aese $ctr4b, $rk13 @ AES block 8k+12 - round 13 ++ ++ rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18 ++ aese $ctr5b, $rk13 @ AES block 8k+13 - round 13 ++ ++ aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12 ++ aese $ctr3b, $rk13 @ AES block 8k+11 - round 13 ++ cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL ++ ++ eor3 $res2b, $ctr_t2b, $ctr2b, $rk14 @ AES block 8k+10 - result ++ rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19 ++ ++ aese $ctr0b, $rk13 @ AES block 8k+8 - round 13 ++ aese $ctr6b, $rk13 @ AES block 8k+14 - round 13 ++ eor3 $res5b, $ctr_t5b, $ctr5b, $rk14 @ AES block 5 - result ++ ++ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ aese $ctr1b, $rk13 @ AES block 8k+9 - round 13 ++ ++ eor3 $res4b, $ctr_t4b, $ctr4b, $rk14 @ AES block 4 - result ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20 ++ eor3 $res3b, $ctr_t3b, $ctr3b, $rk14 @ AES block 8k+11 - result ++ ++ mov $ctr3.16b, $h4.16b @ CTR block 8k+19 ++ eor3 $res1b, $ctr_t1b, $ctr1b, $rk14 @ AES block 8k+9 - result ++ eor3 $res0b, $ctr_t0b, $ctr0b, $rk14 @ AES block 8k+8 - result ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20 ++ stp $res0q, $res1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result ++ mov $ctr2.16b, $h3.16b @ CTR block 8k+18 ++ ++ eor3 $res7b, $ctr_t7b, $ctr7b, $rk14 @ AES block 7 - result ++ eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low ++ stp $res2q, $res3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result ++ ++ eor3 $res6b, $ctr_t6b, $ctr6b, $rk14 @ AES block 6 - result ++ mov $ctr1.16b, $h2.16b @ CTR block 8k+17 ++ stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result ++ ++ stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result ++ mov $ctr0.16b, $h1.16b @ CTR block 8k+16 ++ b.lt .L256_enc_main_loop ++ ++.L256_enc_prepretail: @ PREPRETAIL ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13 ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13 ++ ++ rev64 $res2b, $res2b @ GHASH block 8k+2 ++ ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14 ++ ++ rev64 $res5b, $res5b @ GHASH block 8k+5 ++ ldr $h56kq, [$current_tag, #144] @ 
load h6k | h5k ++ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k ++ ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15 ++ ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0 ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0 ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0 ++ ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0 ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0 ++ ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0 ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0 ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0 ++ ++ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 ++ rev64 $res0b, $res0b @ GHASH block 8k ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1 ++ ++ rev64 $res1b, $res1b @ GHASH block 8k+1 ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1 ++ ++ ldr $h7q, [$current_tag, #176] @ load h7l | h7h ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ldr $h8q, [$current_tag, #208] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1 ++ ++ ldr $h5q, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ldr $h6q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1 ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1 ++ ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1 ++ eor $res0b, $res0b, $acc_lb @ PRE 1 ++ ++ rev64 $res3b, $res3b @ GHASH block 8k+3 ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1 ++ ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2 ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2 ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1 ++ ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2 ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2 ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2 ++ ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2 ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2 ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2 ++ ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high ++ ++ rev64 $res6b, $res6b @ GHASH block 8k+6 ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3 ++ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high ++ ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3 ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low ++ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ ++ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3 ++ ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3 ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3 ++ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high ++ ++ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low ++ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high ++ aese 
$ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3 ++ ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3 ++ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3 ++ ++ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4 ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4 ++ ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4 ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4 ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4 ++ ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5 ++ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid ++ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high ++ ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4 ++ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4 ++ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4 ++ ++ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low ++ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid ++ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ ++ rev64 $res4b, $res4b @ GHASH block 8k+4 ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5 ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5 ++ ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5 ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5 ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid ++ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid ++ ++ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low ++ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid ++ ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5 ++ rev64 $res7b, $res7b @ GHASH block 8k+7 ++ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5 ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5 ++ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6 ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6 ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6 ++ ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6 ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6 ++ ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6 ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6 ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6 ++ ++ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high ++ pmull $h4.1q, 
$res4.1d, $h4.1d @ GHASH block 8k+4 - low ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7 ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7 ++ ++ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high ++ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7 ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7 ++ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low ++ ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7 ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7 ++ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ ++ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high ++ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7 ++ ++ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7 ++ ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8 ++ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8 ++ ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8 ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8 ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8 ++ ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8 ++ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8 ++ ++ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid ++ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8 ++ ++ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high ++ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid ++ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid ++ ++ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low ++ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high ++ ++ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11 ++ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9 ++ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9 ++ ++ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high ++ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ ++ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low ++ ++ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9 ++ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9 ++ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9 ++ ++ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9 ++ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9 ++ ++ aese $ctr5b, $rk10 \n aesmc 
$ctr5b, $ctr5b @ AES block 8k+13 - round 10 ++ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10 ++ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9 ++ ++ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10 ++ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10 ++ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10 ++ ++ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10 ++ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10 ++ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10 ++ ++ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11 ++ ++ ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13 ++ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11 ++ ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11 ++ aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11 ++ ++ aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11 ++ aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11 ++ aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11 ++ ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11 ++ ldr $rk14q, [$cc, #224] @ load rk14 ++ ++ aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12 ++ aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12 ++ aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12 ++ ++ aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12 ++ aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12 ++ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ ++ aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15 ++ ++ aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12 ++ aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12 ++ aese $ctr0b, $rk13 @ AES block 8k+8 - round 13 ++ ++ eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low ++ aese $ctr5b, $rk13 @ AES block 8k+13 - round 13 ++ aese $ctr1b, $rk13 @ AES block 8k+9 - round 13 ++ ++ aese $ctr3b, $rk13 @ AES block 8k+11 - round 13 ++ aese $ctr4b, $rk13 @ AES block 8k+12 - round 13 ++ aese $ctr7b, $rk13 @ AES block 8k+15 - round 13 ++ ++ aese $ctr2b, $rk13 @ AES block 8k+10 - round 13 ++ aese $ctr6b, $rk13 @ AES block 8k+14 - round 13 ++.L256_enc_tail: @ TAIL ++ ++ ldp $h78kq, $h8q, [$current_tag, #192] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process ++ ++ ldr $ctr_t0q, [$input_ptr], #16 @ AES block 8k+8 - load plaintext ++ ++ ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ++ ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag ++ ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ ext $h7.16b, $h7.16b, $h7.16b, 
#8 ++ mov $t1.16b, $rk14 ++ ++ cmp $main_end_input_ptr, #112 ++ eor3 $res1b, $ctr_t0b, $ctr0b, $t1.16b @ AES block 8k+8 - result ++ b.gt .L256_enc_blocks_more_than_7 ++ ++ movi $acc_l.8b, #0 ++ mov $ctr7b, $ctr6b ++ movi $acc_h.8b, #0 ++ ++ mov $ctr6b, $ctr5b ++ mov $ctr5b, $ctr4b ++ mov $ctr4b, $ctr3b ++ ++ mov $ctr3b, $ctr2b ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ mov $ctr2b, $ctr1b ++ ++ movi $acc_m.8b, #0 ++ cmp $main_end_input_ptr, #96 ++ b.gt .L256_enc_blocks_more_than_6 ++ ++ mov $ctr7b, $ctr6b ++ mov $ctr6b, $ctr5b ++ cmp $main_end_input_ptr, #80 ++ ++ mov $ctr5b, $ctr4b ++ mov $ctr4b, $ctr3b ++ mov $ctr3b, $ctr1b ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ b.gt .L256_enc_blocks_more_than_5 ++ ++ mov $ctr7b, $ctr6b ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ++ mov $ctr6b, $ctr5b ++ mov $ctr5b, $ctr4b ++ ++ cmp $main_end_input_ptr, #64 ++ mov $ctr4b, $ctr1b ++ b.gt .L256_enc_blocks_more_than_4 ++ ++ cmp $main_end_input_ptr, #48 ++ mov $ctr7b, $ctr6b ++ mov $ctr6b, $ctr5b ++ ++ mov $ctr5b, $ctr1b ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ b.gt .L256_enc_blocks_more_than_3 ++ ++ cmp $main_end_input_ptr, #32 ++ mov $ctr7b, $ctr6b ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ ++ mov $ctr6b, $ctr1b ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ b.gt .L256_enc_blocks_more_than_2 ++ ++ mov $ctr7b, $ctr1b ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ cmp $main_end_input_ptr, #16 ++ b.gt .L256_enc_blocks_more_than_1 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ b .L256_enc_blocks_less_than_1 ++.L256_enc_blocks_more_than_7: @ blocks left > 7 ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-7 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-7 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-6 block - load plaintext ++ ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid ++ ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid ++ ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid ++ eor3 $res1b, $ctr_t1b, $ctr1b, $t1.16b @ AES final-6 block - result ++ ++ pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low ++.L256_enc_blocks_more_than_6: @ blocks left > 6 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-6 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-6 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid ++ pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high ++ ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-5 block - load plaintext ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid ++ ++ pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid ++ eor3 $res1b, $ctr_t1b, $ctr2b, $t1.16b @ AES final-5 block - result ++ ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high ++.L256_enc_blocks_more_than_5: @ blocks left > 5 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-5 block - store result ++ ++ rev64 $res0b, $res1b 
@ GHASH final-5 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid ++ ++ pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid ++ ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-4 block - load plaintext ++ pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low ++ ++ pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid ++ eor3 $res1b, $ctr_t1b, $ctr3b, $t1.16b @ AES final-4 block - result ++.L256_enc_blocks_more_than_4: @ blocks left > 4 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-4 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-4 block ++ ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-3 block - load plaintext ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid ++ pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high ++ ++ eor3 $res1b, $ctr_t1b, $ctr4b, $t1.16b @ AES final-3 block - result ++ pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low ++ ++ pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid ++ ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high ++.L256_enc_blocks_more_than_3: @ blocks left > 3 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result ++ ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ rev64 $res0b, $res1b @ GHASH final-3 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid ++ pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-2 block - load plaintext ++ ++ pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid ++ pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low ++ ++ eor3 $res1b, $ctr_t1b, $ctr5b, $t1.16b @ AES final-2 block - result ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low ++.L256_enc_blocks_more_than_2: @ blocks left > 2 ++ ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-2 block ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-1 block - load plaintext ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid ++ ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high ++ eor3 $res1b, $ctr_t1b, 
$ctr6b, $t1.16b @ AES final-1 block - result ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high ++ ++ pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid ++ pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low ++.L256_enc_blocks_more_than_1: @ blocks left > 1 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result ++ ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ rev64 $res0b, $res1b @ GHASH final-1 block ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final block - load plaintext ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid ++ pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high ++ ++ eor3 $res1b, $ctr_t1b, $ctr7b, $t1.16b @ AES final block - result ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high ++ ++ pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid ++ ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid ++ ++ pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid ++.L256_enc_blocks_less_than_1: @ blocks left <= 1 ++ ++ and $bit_length, $bit_length, #127 @ bit_length %= 128 ++ ++ sub $bit_length, $bit_length, #128 @ bit_length -= 128 ++ ++ neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) ++ ++ mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff ++ and $bit_length, $bit_length, #127 @ bit_length %= 128 ++ ++ lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block ++ cmp $bit_length, #64 ++ mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff ++ ++ csel $temp3_x, $temp0_x, xzr, lt ++ csel $temp2_x, $temp1_x, $temp0_x, lt ++ ++ mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ++ ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored ++ mov $ctr0.d[1], $temp3_x ++ ++ and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits ++ ++ rev64 $res0b, $res1b @ GHASH final block ++ ++ rev32 $rtmp_ctr.16b, $rtmp_ctr.16b ++ bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing ++ str $rtmp_ctrq, [$counter] @ store the updated counter ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ st1 { $res1b}, [$output_ptr] @ store all 16B ++ ++ ins $t0.d[0], $res0.d[1] @ GHASH final block - mid ++ pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high ++ pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low ++ ++ eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid ++ ++ pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid ++ ++ eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ ++ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ ++ eor3 $acc_mb, 
$acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ ++ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low ++ ext $acc_lb, $acc_lb, $acc_lb, #8 ++ rev64 $acc_lb, $acc_lb ++ st1 { $acc_l.16b }, [$current_tag] ++ lsr x0, $bit_length, #3 @ return sizes ++ ++ ldp d10, d11, [sp, #16] ++ ldp d12, d13, [sp, #32] ++ ldp d14, d15, [sp, #48] ++ ldp d8, d9, [sp], #80 ++ ret ++ ++.L256_enc_ret: ++ mov w0, #0x0 ++ ret ++.size unroll8_eor3_aes_gcm_enc_256_kernel,.-unroll8_eor3_aes_gcm_enc_256_kernel ++___ ++ ++{ ++######################################################################################### ++# size_t unroll8_eor3_aes_gcm_dec_256_kernel(const unsigned char *in, ++# size_t len, ++# unsigned char *out, ++# const void *key, ++# unsigned char ivec[16], ++# u64 *Xi); ++# ++$code.=<<___; ++.global unroll8_eor3_aes_gcm_dec_256_kernel ++.type unroll8_eor3_aes_gcm_dec_256_kernel,%function ++.align 4 ++unroll8_eor3_aes_gcm_dec_256_kernel: ++ AARCH64_VALID_CALL_TARGET ++ cbz x1, .L256_dec_ret ++ stp d8, d9, [sp, #-80]! ++ mov $counter, x4 ++ mov $cc, x5 ++ stp d10, d11, [sp, #16] ++ stp d12, d13, [sp, #32] ++ stp d14, d15, [sp, #48] ++ mov x5, #0xc200000000000000 ++ stp x5, xzr, [sp, #64] ++ add $modulo_constant, sp, #64 ++ ++ ld1 { $ctr0b}, [$counter] @ CTR block 0 ++ ++ mov $constant_temp, #0x100000000 @ set up counter increment ++ movi $rctr_inc.16b, #0x0 ++ mov $rctr_inc.d[1], $constant_temp ++ lsr $main_end_input_ptr, $bit_length, #3 @ byte_len ++ ++ sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 ++ ++ rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0 ++ ++ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1 ++ ++ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2 ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ ++ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3 ++ ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4 ++ ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 ++ ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5 ++ ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 ++ ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6 ++ ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7 ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0 ++ ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0 ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0 ++ ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0 ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1 ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1 ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, 
$ctr0b @ AES block 0 - round 1 ++ ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1 ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1 ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 ++ ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 ++ ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2 ++ ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2 ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2 ++ ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2 ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 ++ ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3 ++ ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3 ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3 ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 ++ ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3 ++ ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4 ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 ++ ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4 ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 ++ ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4 ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4 ++ ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5 ++ ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5 ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5 ++ ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5 ++ ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 ++ ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 ++ ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6 ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6 ++ ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6 ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6 ++ ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7 ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 ++ ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 
++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7 ++ ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7 ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7 ++ ++ and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8 ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8 ++ ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 ++ ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8 ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8 ++ ++ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9 ++ ++ ld1 { $acc_lb}, [$current_tag] ++ ext $acc_lb, $acc_lb, $acc_lb, #8 ++ rev64 $acc_lb, $acc_lb ++ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11 ++ add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr ++ add $main_end_input_ptr, $main_end_input_ptr, $input_ptr ++ ++ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9 ++ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9 ++ ++ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9 ++ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9 ++ ++ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9 ++ ++ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9 ++ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9 ++ ++ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 10 ++ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 10 ++ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 10 ++ ++ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10 ++ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10 ++ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10 ++ ++ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 10 ++ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10 ++ ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13 ++ ++ aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7 ++ ++ aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 11 ++ aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11 ++ aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11 ++ ++ aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 11 ++ aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 11 ++ aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11 ++ ++ aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 11 ++ ldr $rk14q, [$cc, #224] @ load rk14 ++ ++ aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12 ++ aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 12 ++ aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 12 ++ ++ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks ++ aese $ctr3b, $rk12 \n aesmc 
$ctr3b, $ctr3b @ AES block 3 - round 12 ++ aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12 ++ ++ aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 12 ++ aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12 ++ aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 12 ++ ++ aese $ctr5b, $rk13 @ AES block 5 - round 13 ++ aese $ctr1b, $rk13 @ AES block 1 - round 13 ++ aese $ctr2b, $rk13 @ AES block 2 - round 13 ++ ++ aese $ctr0b, $rk13 @ AES block 0 - round 13 ++ aese $ctr4b, $rk13 @ AES block 4 - round 13 ++ aese $ctr6b, $rk13 @ AES block 6 - round 13 ++ ++ aese $ctr3b, $rk13 @ AES block 3 - round 13 ++ aese $ctr7b, $rk13 @ AES block 7 - round 13 ++ b.ge .L256_dec_tail @ handle tail ++ ++ ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 0, 1 - load ciphertext ++ ++ ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 2, 3 - load ciphertext ++ ++ ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 4, 5 - load ciphertext ++ ++ ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 6, 7 - load ciphertext ++ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks ++ ++ eor3 $ctr1b, $res1b, $ctr1b, $rk14 @ AES block 1 - result ++ eor3 $ctr0b, $res0b, $ctr0b, $rk14 @ AES block 0 - result ++ stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 0, 1 - store result ++ ++ rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8 ++ eor3 $ctr3b, $res3b, $ctr3b, $rk14 @ AES block 3 - result ++ ++ eor3 $ctr5b, $res5b, $ctr5b, $rk14 @ AES block 5 - result ++ ++ eor3 $ctr4b, $res4b, $ctr4b, $rk14 @ AES block 4 - result ++ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9 ++ ++ eor3 $ctr2b, $res2b, $ctr2b, $rk14 @ AES block 2 - result ++ stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 2, 3 - store result ++ ++ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10 ++ ++ eor3 $ctr6b, $res6b, $ctr6b, $rk14 @ AES block 6 - result ++ ++ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11 ++ stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 4, 5 - store result ++ ++ eor3 $ctr7b, $res7b, $ctr7b, $rk14 @ AES block 7 - result ++ stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 6, 7 - store result ++ ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12 ++ b.ge .L256_dec_prepretail @ do prepretail ++ ++.L256_dec_main_loop: @ main loop start ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13 ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13 ++ ++ rev64 $res1b, $res1b @ GHASH block 8k+1 ++ ldr $h7q, [$current_tag, #176] @ load h7l | h7h ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ldr $h8q, [$current_tag, #208] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14 ++ rev64 $res0b, $res0b @ GHASH block 8k ++ ++ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 ++ rev64 $res4b, $res4b @ GHASH block 8k+4 ++ rev64 $res3b, $res3b @ GHASH block 8k+3 ++ ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15 ++ rev64 $res7b, $res7b @ GHASH block 8k+7 ++ ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0 ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0 ++ aese $ctr2b, $rk0 \n aesmc 
$ctr2b, $ctr2b @ AES block 8k+10 - round 0 ++ ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0 ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0 ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0 ++ ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0 ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0 ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ ++ eor $res0b, $res0b, $acc_lb @ PRE 1 ++ ldr $h5q, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ldr $h6q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1 ++ ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1 ++ rev64 $res2b, $res2b @ GHASH block 8k+2 ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1 ++ ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1 ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1 ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1 ++ ++ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1 ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1 ++ ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2 ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2 ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2 ++ ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2 ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2 ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low ++ ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2 ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2 ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2 ++ ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3 ++ ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3 ++ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high ++ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low ++ ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3 ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3 ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high ++ ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3 ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3 ++ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ ++ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3 ++ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high ++ ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4 ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3 ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4 ++ ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4 ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4 ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4 ++ ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4 ++ 
aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4 ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4 ++ ++ ldr $h56kq, [$current_tag, #144] @ load h6k | h5k ++ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k ++ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid ++ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low ++ ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5 ++ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low ++ ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5 ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5 ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5 ++ ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5 ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5 ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5 ++ ++ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high ++ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ rev64 $res5b, $res5b @ GHASH block 8k+5 ++ ++ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid ++ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid ++ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6 ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6 ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5 ++ ++ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6 ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6 ++ ++ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6 ++ ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6 ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6 ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6 ++ ++ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid ++ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid ++ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low ++ ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ rev64 $res6b, $res6b @ GHASH block 8k+6 ++ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid ++ ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7 ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7 ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7 ++ ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7 ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7 ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7 ++ ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ldr $h34kq, 
[$current_tag, #96] @ load h4k | h3k ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7 ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7 ++ ++ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high ++ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low ++ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8 ++ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8 ++ ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8 ++ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8 ++ ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8 ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8 ++ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high ++ ++ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8 ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8 ++ ++ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11 ++ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low ++ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15 ++ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high ++ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9 ++ ++ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9 ++ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9 ++ ++ ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load ciphertext ++ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9 ++ ++ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid ++ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9 ++ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9 ++ ++ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid ++ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid ++ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high ++ ++ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low ++ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10 ++ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10 ++ ++ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid ++ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9 ++ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low ++ ++ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9 ++ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high ++ ++ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10 ++ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10 ++ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10 ++ ++ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10 ++ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10 ++ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - 
round 10 ++ ++ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low ++ rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16 ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16 ++ aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11 ++ ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13 ++ ++ aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11 ++ aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11 ++ ++ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17 ++ aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11 ++ ++ ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load ciphertext ++ aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11 ++ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ ++ aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17 ++ aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11 ++ ++ aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12 ++ aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12 ++ aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12 ++ ++ rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18 ++ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ ++ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12 ++ aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11 ++ ++ ldr $rk14q, [$cc, #224] @ load rk14 ++ aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12 ++ aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12 ++ ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12 ++ aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12 ++ ++ ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load ciphertext ++ aese $ctr1b, $rk13 @ AES block 8k+9 - round 13 ++ aese $ctr2b, $rk13 @ AES block 8k+10 - round 13 ++ ++ ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load ciphertext ++ aese $ctr0b, $rk13 @ AES block 8k+8 - round 13 ++ aese $ctr5b, $rk13 @ AES block 8k+13 - round 13 ++ ++ rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19 ++ eor3 $ctr2b, $res2b, $ctr2b, $rk14 @ AES block 8k+10 - result ++ eor3 $ctr1b, $res1b, $ctr1b, $rk14 @ AES block 8k+9 - result ++ ++ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ aese $ctr7b, $rk13 @ AES block 8k+15 - round 13 ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19 ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ aese $ctr4b, $rk13 @ AES block 8k+12 - round 13 ++ ++ eor3 $ctr5b, $res5b, $ctr5b, $rk14 @ AES block 8k+13 - result ++ eor3 $ctr0b, $res0b, $ctr0b, $rk14 @ AES block 8k+8 - result ++ aese $ctr3b, $rk13 @ AES block 8k+11 - round 13 ++ ++ stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result ++ mov $ctr0.16b, $h1.16b @ CTR block 8k+16 ++ eor3 $ctr4b, $res4b, $ctr4b, $rk14 @ AES block 8k+12 - result ++ ++ eor3 $acc_lb, 
$acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low ++ eor3 $ctr3b, $res3b, $ctr3b, $rk14 @ AES block 8k+11 - result ++ stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result ++ ++ mov $ctr3.16b, $h4.16b @ CTR block 8k+19 ++ mov $ctr2.16b, $h3.16b @ CTR block 8k+18 ++ aese $ctr6b, $rk13 @ AES block 8k+14 - round 13 ++ ++ mov $ctr1.16b, $h2.16b @ CTR block 8k+17 ++ stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result ++ eor3 $ctr7b, $res7b, $ctr7b, $rk14 @ AES block 8k+15 - result ++ ++ eor3 $ctr6b, $res6b, $ctr6b, $rk14 @ AES block 8k+14 - result ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20 ++ ++ cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL ++ stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result ++ b.lt .L256_dec_main_loop ++ ++.L256_dec_prepretail: @ PREPRETAIL ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13 ++ ++ rev64 $res4b, $res4b @ GHASH block 8k+4 ++ ldr $h56kq, [$current_tag, #144] @ load h6k | h5k ++ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k ++ ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14 ++ rev64 $res0b, $res0b @ GHASH block 8k ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14 ++ ++ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 ++ ldr $h7q, [$current_tag, #176] @ load h7l | h7h ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ldr $h8q, [$current_tag, #208] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ rev64 $res1b, $res1b @ GHASH block 8k+1 ++ ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15 ++ rev64 $res2b, $res2b @ GHASH block 8k+2 ++ ldr $h5q, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ldr $h6q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0 ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0 ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0 ++ ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0 ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0 ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0 ++ ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1 ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0 ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0 ++ ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1 ++ eor $res0b, $res0b, $acc_lb @ PRE 1 ++ ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1 ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1 ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1 ++ ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1 ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1 ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1 ++ ++ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high ++ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low ++ ++ rev64 $res3b, $res3b @ GHASH block 8k+3 ++ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low ++ ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, 
$ctr5b @ AES block 8k+13 - round 2 ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2 ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2 ++ ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2 ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2 ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high ++ ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2 ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3 ++ ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3 ++ rev64 $res6b, $res6b @ GHASH block 8k+6 ++ ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3 ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2 ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3 ++ ++ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high ++ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2 ++ ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3 ++ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high ++ ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3 ++ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high ++ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid ++ ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3 ++ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3 ++ ++ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high ++ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ ++ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid ++ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low ++ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low ++ ++ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4 ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4 ++ ++ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4 ++ ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4 ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4 ++ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid ++ ++ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5 ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4 ++ ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5 ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4 ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4 ++ ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5 ++ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5 ++ ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5 ++ aese 
$ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5 ++ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid ++ ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5 ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5 ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ rev64 $res7b, $res7b @ GHASH block 8k+7 ++ rev64 $res5b, $res5b @ GHASH block 8k+5 ++ ++ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ ++ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6 ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6 ++ ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6 ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6 ++ ++ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high ++ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high ++ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low ++ ++ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low ++ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7 ++ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6 ++ ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6 ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6 ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6 ++ ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7 ++ ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7 ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7 ++ ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7 ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7 ++ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high ++ ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7 ++ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7 ++ ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8 ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8 ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8 ++ ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8 ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8 ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8 ++ ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8 ++ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9 ++ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ ++ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9 ++ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9 ++ eor $res6.16b, $res6.16b, 
$t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ ++ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9 ++ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9 ++ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid ++ ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8 ++ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid ++ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high ++ ++ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid ++ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid ++ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low ++ ++ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11 ++ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low ++ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ ++ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9 ++ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9 ++ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9 ++ ++ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high ++ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ ++ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ ++ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10 ++ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10 ++ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10 ++ ++ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10 ++ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10 ++ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10 ++ ++ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ ++ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10 ++ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10 ++ ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13 ++ ++ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ ++ aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11 ++ aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11 ++ aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11 ++ ++ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11 ++ ++ aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11 ++ aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11 ++ aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11 ++ ++ aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11 ++ aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12 ++ ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ ++ aese $ctr3b, $rk13 @ AES block 8k+11 - round 13 ++ aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12 ++ aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12 ++ ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12 ++ aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12 ++ ++ aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12 ++ ldr $rk14q, [$cc, 
#224] @ load rk14 ++ aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12 ++ ++ aese $ctr4b, $rk13 @ AES block 8k+12 - round 13 ++ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12 ++ ++ aese $ctr6b, $rk13 @ AES block 8k+14 - round 13 ++ aese $ctr2b, $rk13 @ AES block 8k+10 - round 13 ++ aese $ctr1b, $rk13 @ AES block 8k+9 - round 13 ++ ++ aese $ctr5b, $rk13 @ AES block 8k+13 - round 13 ++ eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15 ++ ++ aese $ctr7b, $rk13 @ AES block 8k+15 - round 13 ++ aese $ctr0b, $rk13 @ AES block 8k+8 - round 13 ++.L256_dec_tail: @ TAIL ++ ++ ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag ++ sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process ++ cmp $main_end_input_ptr, #112 ++ ++ ldr $res1q, [$input_ptr], #16 @ AES block 8k+8 - load ciphertext ++ ++ ldp $h78kq, $h8q, [$current_tag, #192] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ mov $t1.16b, $rk14 ++ ++ ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ++ eor3 $res4b, $res1b, $ctr0b, $t1.16b @ AES block 8k+8 - result ++ ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ b.gt .L256_dec_blocks_more_than_7 ++ ++ mov $ctr7b, $ctr6b ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ mov $ctr6b, $ctr5b ++ ++ mov $ctr5b, $ctr4b ++ mov $ctr4b, $ctr3b ++ movi $acc_l.8b, #0 ++ ++ movi $acc_h.8b, #0 ++ movi $acc_m.8b, #0 ++ mov $ctr3b, $ctr2b ++ ++ cmp $main_end_input_ptr, #96 ++ mov $ctr2b, $ctr1b ++ b.gt .L256_dec_blocks_more_than_6 ++ ++ mov $ctr7b, $ctr6b ++ mov $ctr6b, $ctr5b ++ ++ mov $ctr5b, $ctr4b ++ cmp $main_end_input_ptr, #80 ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ++ mov $ctr4b, $ctr3b ++ mov $ctr3b, $ctr1b ++ b.gt .L256_dec_blocks_more_than_5 ++ ++ cmp $main_end_input_ptr, #64 ++ mov $ctr7b, $ctr6b ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ++ mov $ctr6b, $ctr5b ++ ++ mov $ctr5b, $ctr4b ++ mov $ctr4b, $ctr1b ++ b.gt .L256_dec_blocks_more_than_4 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ mov $ctr7b, $ctr6b ++ cmp $main_end_input_ptr, #48 ++ ++ mov $ctr6b, $ctr5b ++ mov $ctr5b, $ctr1b ++ b.gt .L256_dec_blocks_more_than_3 ++ ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ mov $ctr7b, $ctr6b ++ ++ cmp $main_end_input_ptr, #32 ++ mov $ctr6b, $ctr1b ++ b.gt .L256_dec_blocks_more_than_2 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ++ mov $ctr7b, $ctr1b ++ cmp $main_end_input_ptr, #16 ++ b.gt .L256_dec_blocks_more_than_1 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ b .L256_dec_blocks_less_than_1 ++.L256_dec_blocks_more_than_7: @ blocks left > 7 ++ rev64 $res0b, $res1b @ GHASH final-7 block ++ ldr $res1q, [$input_ptr], #16 @ AES final-6 block - load ciphertext ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-7 block - store result ++ ++ ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid ++ eor3 $res4b, $res1b, $ctr1b, $t1.16b @ AES final-6 block - result ++ ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high ++ ++ eor $rk4v.8b, 
$rk4v.8b, $res0.8b @ GHASH final-7 block - mid ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low ++ pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid ++.L256_dec_blocks_more_than_6: @ blocks left > 6 ++ ++ rev64 $res0b, $res1b @ GHASH final-6 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ldr $res1q, [$input_ptr], #16 @ AES final-5 block - load ciphertext ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-6 block - store result ++ pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high ++ ++ pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low ++ ++ eor3 $res4b, $res1b, $ctr2b, $t1.16b @ AES final-5 block - result ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid ++ ++ pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high ++.L256_dec_blocks_more_than_5: @ blocks left > 5 ++ ++ rev64 $res0b, $res1b @ GHASH final-5 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid ++ ++ ldr $res1q, [$input_ptr], #16 @ AES final-4 block - load ciphertext ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-5 block - store result ++ ++ pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid ++ ++ pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high ++ eor3 $res4b, $res1b, $ctr3b, $t1.16b @ AES final-4 block - result ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++.L256_dec_blocks_more_than_4: @ blocks left > 4 ++ ++ rev64 $res0b, $res1b @ GHASH final-4 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid ++ ldr $res1q, [$input_ptr], #16 @ AES final-3 block - load ciphertext ++ ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low ++ pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high ++ ++ pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-4 block - store result ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid ++ eor3 $res4b, $res1b, $ctr4b, $t1.16b @ AES final-3 block - result ++.L256_dec_blocks_more_than_3: @ blocks left > 3 ++ ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ rev64 $res0b, $res1b @ GHASH final-3 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ldr $res1q, [$input_ptr], #16 @ AES final-2 block - load ciphertext ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid ++ st1 { 
$res4b}, [$output_ptr], #16 @ AES final-3 block - store result ++ ++ eor3 $res4b, $res1b, $ctr5b, $t1.16b @ AES final-2 block - result ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid ++ pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low ++ pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high ++ ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid ++.L256_dec_blocks_more_than_2: @ blocks left > 2 ++ ++ rev64 $res0b, $res1b @ GHASH final-2 block ++ ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ldr $res1q, [$input_ptr], #16 @ AES final-1 block - load ciphertext ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid ++ ++ pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-2 block - store result ++ eor3 $res4b, $res1b, $ctr6b, $t1.16b @ AES final-1 block - result ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid ++ pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high ++.L256_dec_blocks_more_than_1: @ blocks left > 1 ++ ++ rev64 $res0b, $res1b @ GHASH final-1 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid ++ ldr $res1q, [$input_ptr], #16 @ AES final block - load ciphertext ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-1 block - store result ++ ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low ++ ++ eor3 $res4b, $res1b, $ctr7b, $t1.16b @ AES final block - result ++ pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high ++ ++ pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid ++ ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid ++.L256_dec_blocks_less_than_1: @ blocks left <= 1 ++ ++ ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored ++ mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff ++ and $bit_length, $bit_length, #127 @ bit_length %= 128 ++ ++ sub $bit_length, $bit_length, #128 @ bit_length -= 128 ++ rev32 $rtmp_ctr.16b, $rtmp_ctr.16b ++ str $rtmp_ctrq, [$counter] @ store the updated counter ++ ++ neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) ++ ++ and $bit_length, $bit_length, #127 @ bit_length %= 128 ++ ++ lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block ++ cmp $bit_length, #64 
++ mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff ++ ++ csel $temp3_x, $temp0_x, xzr, lt ++ csel $temp2_x, $temp1_x, $temp0_x, lt ++ ++ mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block ++ mov $ctr0.d[1], $temp3_x ++ ++ and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ bif $res4b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing ++ ++ rev64 $res0b, $res1b @ GHASH final block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $t0.d[0], $res0.d[1] @ GHASH final block - mid ++ pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high ++ ++ eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid ++ ++ pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high ++ ++ pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid ++ ++ eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low ++ ++ pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ eor $t10.16b, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ ++ ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ st1 { $res4b}, [$output_ptr] @ store all 16B ++ ++ eor $acc_mb, $acc_mb, $t10.16b @ MODULO - karatsuba tidy up ++ ++ eor $t11.16b, $acc_hb, $t11.16b @ MODULO - fold into mid ++ eor $acc_mb, $acc_mb, $t11.16b @ MODULO - fold into mid ++ ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ ++ ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low ++ ++ eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low ++ ext $acc_lb, $acc_lb, $acc_lb, #8 ++ rev64 $acc_lb, $acc_lb ++ st1 { $acc_l.16b }, [$current_tag] ++ lsr x0, $bit_length, #3 @ return sizes ++ ++ ldp d10, d11, [sp, #16] ++ ldp d12, d13, [sp, #32] ++ ldp d14, d15, [sp, #48] ++ ldp d8, d9, [sp], #80 ++ ret ++ ++.L256_dec_ret: ++ mov w0, #0x0 ++ ret ++.size unroll8_eor3_aes_gcm_dec_256_kernel,.-unroll8_eor3_aes_gcm_dec_256_kernel ++___ ++} ++} ++ ++$code.=<<___; ++.asciz "AES GCM module for ARMv8, SPDX BSD-3-Clause by " ++.align 2 ++#endif ++___ ++ ++{ ++ my %opcode = ( ++ "rax1" => 0xce608c00, "eor3" => 0xce000000, ++ "bcax" => 0xce200000, "xar" => 0xce800000 ); ++ ++ sub unsha3 { ++ my ($mnemonic,$arg)=@_; ++ ++ $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/ ++ && ++ sprintf ".inst\t0x%08x\t//%s %s", ++ $opcode{$mnemonic}|$1|($2<<5)|($3<<16)|(eval($4)<<10), ++ $mnemonic,$arg; ++ } ++ sub unvmov { ++ my $arg=shift; ++ ++ $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o && ++ sprintf "ins v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1, ++ $3<8?$3:$3+8,($4 eq "lo")?0:1; ++ } ++ ++ foreach(split("\n",$code)) { ++ s/@\s/\/\//o; # old->new style commentary ++ s/\`([^\`]*)\`/eval($1)/ge; ++ ++ m/\bld1r\b/ and s/\.16b/.2d/g or ++ s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge; ++ print $_,"\n"; ++ } ++} ++ ++close STDOUT or die "error closing STDOUT: $!"; # enforce flush +Index: openssl-1.1.1m/crypto/modes/asm/ghashv8-armx.pl +=================================================================== +--- openssl-1.1.1m.orig/crypto/modes/asm/ghashv8-armx.pl ++++ openssl-1.1.1m/crypto/modes/asm/ghashv8-armx.pl +@@ -141,6 +141,7 @@ gcm_init_v8: + ___ + if 
($flavour =~ /64/) { + my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7)); ++my ($H3,$H34k,$H4,$H5,$H56k,$H6,$H7,$H78k,$H8) = map("q$_",(15..23)); + + $code.=<<___; + @ calculate H^3 and H^4 +@@ -175,15 +176,103 @@ $code.=<<___; + vpmull.p64 $Yl,$Yl,$xC2 + veor $t2,$t2,$Xh + veor $t3,$t3,$Yh +- veor $H, $Xl,$t2 @ H^3 +- veor $H2,$Yl,$t3 @ H^4 ++ veor $H3, $Xl,$t2 @ H^3 ++ veor $H4,$Yl,$t3 @ H^4 + +- vext.8 $t0,$H, $H,#8 @ Karatsuba pre-processing +- vext.8 $t1,$H2,$H2,#8 +- veor $t0,$t0,$H +- veor $t1,$t1,$H2 +- vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed +- vst1.64 {$H-$H2},[x0] @ store Htable[3..5] ++ vext.8 $t0,$H3, $H3,#8 @ Karatsuba pre-processing ++ vext.8 $t1,$H4,$H4,#8 ++ vext.8 $t2,$H2,$H2,#8 ++ veor $t0,$t0,$H3 ++ veor $t1,$t1,$H4 ++ veor $t2,$t2,$H2 ++ vext.8 $H34k,$t0,$t1,#8 @ pack Karatsuba pre-processed ++ vst1.64 {$H3-$H4},[x0],#48 @ store Htable[3..5] ++ ++ @ calculate H^5 and H^6 ++ vpmull.p64 $Xl,$H2, $H3 ++ vpmull.p64 $Yl,$H3,$H3 ++ vpmull2.p64 $Xh,$H2, $H3 ++ vpmull2.p64 $Yh,$H3,$H3 ++ vpmull.p64 $Xm,$t0,$t2 ++ vpmull.p64 $Ym,$t0,$t0 ++ ++ vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing ++ vext.8 $t1,$Yl,$Yh,#8 ++ veor $t2,$Xl,$Xh ++ veor $Xm,$Xm,$t0 ++ veor $t3,$Yl,$Yh ++ veor $Ym,$Ym,$t1 ++ veor $Xm,$Xm,$t2 ++ vpmull.p64 $t2,$Xl,$xC2 @ 1st phase ++ veor $Ym,$Ym,$t3 ++ vpmull.p64 $t3,$Yl,$xC2 ++ ++ vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result ++ vmov $Yh#lo,$Ym#hi ++ vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl ++ vmov $Ym#hi,$Yl#lo ++ veor $Xl,$Xm,$t2 ++ veor $Yl,$Ym,$t3 ++ ++ vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase ++ vext.8 $t3,$Yl,$Yl,#8 ++ vpmull.p64 $Xl,$Xl,$xC2 ++ vpmull.p64 $Yl,$Yl,$xC2 ++ veor $t2,$t2,$Xh ++ veor $t3,$t3,$Yh ++ veor $H5,$Xl,$t2 @ H^5 ++ veor $H6,$Yl,$t3 @ H^6 ++ ++ vext.8 $t0,$H5, $H5,#8 @ Karatsuba pre-processing ++ vext.8 $t1,$H6,$H6,#8 ++ vext.8 $t2,$H2,$H2,#8 ++ veor $t0,$t0,$H5 ++ veor $t1,$t1,$H6 ++ veor $t2,$t2,$H2 ++ vext.8 $H56k,$t0,$t1,#8 @ pack Karatsuba pre-processed ++ vst1.64 {$H5-$H6},[x0],#48 @ store Htable[6..8] ++ ++ @ calculate H^7 and H^8 ++ vpmull.p64 $Xl,$H2,$H5 ++ vpmull.p64 $Yl,$H2,$H6 ++ vpmull2.p64 $Xh,$H2,$H5 ++ vpmull2.p64 $Yh,$H2,$H6 ++ vpmull.p64 $Xm,$t0,$t2 ++ vpmull.p64 $Ym,$t1,$t2 ++ ++ vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing ++ vext.8 $t1,$Yl,$Yh,#8 ++ veor $t2,$Xl,$Xh ++ veor $Xm,$Xm,$t0 ++ veor $t3,$Yl,$Yh ++ veor $Ym,$Ym,$t1 ++ veor $Xm,$Xm,$t2 ++ vpmull.p64 $t2,$Xl,$xC2 @ 1st phase ++ veor $Ym,$Ym,$t3 ++ vpmull.p64 $t3,$Yl,$xC2 ++ ++ vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result ++ vmov $Yh#lo,$Ym#hi ++ vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl ++ vmov $Ym#hi,$Yl#lo ++ veor $Xl,$Xm,$t2 ++ veor $Yl,$Ym,$t3 ++ ++ vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase ++ vext.8 $t3,$Yl,$Yl,#8 ++ vpmull.p64 $Xl,$Xl,$xC2 ++ vpmull.p64 $Yl,$Yl,$xC2 ++ veor $t2,$t2,$Xh ++ veor $t3,$t3,$Yh ++ veor $H7,$Xl,$t2 @ H^7 ++ veor $H8,$Yl,$t3 @ H^8 ++ ++ vext.8 $t0,$H7,$H7,#8 @ Karatsuba pre-processing ++ vext.8 $t1,$H8,$H8,#8 ++ veor $t0,$t0,$H7 ++ veor $t1,$t1,$H8 ++ vext.8 $H78k,$t0,$t1,#8 @ pack Karatsuba pre-processed ++ vst1.64 {$H7-$H8},[x0] @ store Htable[9..11] + ___ + } + $code.=<<___; +Index: openssl-1.1.1m/crypto/modes/build.info +=================================================================== +--- openssl-1.1.1m.orig/crypto/modes/build.info ++++ openssl-1.1.1m/crypto/modes/build.info +@@ -20,6 +20,8 @@ GENERATE[ghash-armv4.S]=asm/ghash-armv4. + INCLUDE[ghash-armv4.o]=.. + GENERATE[ghashv8-armx.S]=asm/ghashv8-armx.pl $(PERLASM_SCHEME) + INCLUDE[ghashv8-armx.o]=.. 
++GENERATE[aes-gcm-armv8-unroll8_64.S]=asm/aes-gcm-armv8-unroll8_64.pl $(PERLASM_SCHEME) ++INCLUDE[aes-gcm-armv8-unroll8_64.o]=.. + GENERATE[ghash-s390x.S]=asm/ghash-s390x.pl $(PERLASM_SCHEME) + INCLUDE[ghash-s390x.o]=.. + diff --git a/rpm/openssl-1_1-Optimize-AES-XTS-aarch64.patch b/rpm/openssl-1_1-Optimize-AES-XTS-aarch64.patch new file mode 100644 index 0000000..2630902 --- /dev/null +++ b/rpm/openssl-1_1-Optimize-AES-XTS-aarch64.patch @@ -0,0 +1,1616 @@ +From 9ce8e0d17e608de4f85f7543c52b146e3c6a2291 Mon Sep 17 00:00:00 2001 +From: XiaokangQian +Date: Fri, 13 Mar 2020 03:27:34 +0000 +Subject: [PATCH] Optimize AES-XTS mode in OpenSSL for aarch64 + +Aes-xts mode can be optimized by interleaving cipher operation on +several blocks and loop unrolling. Interleaving needs one ideal +unrolling factor, here we adopt the same factor with aes-cbc, +which is described as below: + If blocks number > 5, select 5 blocks as one iteration,every + loop, decrease the blocks number by 5. + If left blocks < 5, treat them as tail blocks. +Detailed implementation has a little adjustment for squeezing +code space. +With this way, for small size such as 16 bytes, the performance is +similar as before, but for big size such as 16k bytes, the performance +improves a lot, even reaches to 2x uplift, for some arches such as A57, +the improvement even reaches more than 2x uplift. We collect many +performance datas on different micro-archs such as thunderx2, +ampere-emag, a72, a75, a57, a53 and N1, all of which reach 0.5-2x uplift. +The following table lists the encryption performance data on aarch64, +take a72, a75, a57, a53 and N1 as examples. Performance value takes the +unit of cycles per byte, takes the format as comparision of values. +List them as below: + +A72: + Before optimization After optimization Improve +evp-aes-128-xts@16 8.899913518 5.949087263 49.60% +evp-aes-128-xts@64 4.525512668 3.389141845 33.53% +evp-aes-128-xts@256 3.502906908 1.633573479 114.43% +evp-aes-128-xts@1024 3.174210419 1.155952639 174.60% +evp-aes-128-xts@8192 3.053019303 1.028134888 196.95% +evp-aes-128-xts@16384 3.025292462 1.02021169 196.54% +evp-aes-256-xts@16 9.971105023 6.754233758 47.63% +evp-aes-256-xts@64 4.931479093 3.786527393 30.24% +evp-aes-256-xts@256 3.746788153 1.943975947 92.74% +evp-aes-256-xts@1024 3.401743802 1.477394648 130.25% +evp-aes-256-xts@8192 3.278769327 1.32950421 146.62% +evp-aes-256-xts@16384 3.27093296 1.325276257 146.81% + +A75: + Before optimization After optimization Improve +evp-aes-128-xts@16 8.397965173 5.126839098 63.80% +evp-aes-128-xts@64 4.176860631 2.59817764 60.76% +evp-aes-128-xts@256 3.069126585 1.284561028 138.92% +evp-aes-128-xts@1024 2.805962699 0.932754655 200.83% +evp-aes-128-xts@8192 2.725820131 0.829820397 228.48% +evp-aes-128-xts@16384 2.71521905 0.823251591 229.82% +evp-aes-256-xts@16 11.24790935 7.383914448 52.33% +evp-aes-256-xts@64 5.294128847 3.048641998 73.66% +evp-aes-256-xts@256 3.861649617 1.570359905 145.91% +evp-aes-256-xts@1024 3.537646797 1.200493533 194.68% +evp-aes-256-xts@8192 3.435353012 1.085345319 216.52% +evp-aes-256-xts@16384 3.437952563 1.097963822 213.12% + +A57: + Before optimization After optimization Improve +evp-aes-128-xts@16 10.57455446 7.165438012 47.58% +evp-aes-128-xts@64 5.418185447 3.721241202 45.60% +evp-aes-128-xts@256 3.855184592 1.747145379 120.66% +evp-aes-128-xts@1024 3.477199757 1.253049735 177.50% +evp-aes-128-xts@8192 3.36768104 1.091943159 208.41% +evp-aes-128-xts@16384 3.360373443 1.088942789 208.59% +evp-aes-256-xts@16 12.54559459 
8.745489036 43.45% +evp-aes-256-xts@64 6.542808937 4.326387568 51.23% +evp-aes-256-xts@256 4.62668822 2.119908754 118.25% +evp-aes-256-xts@1024 4.161716505 1.557335554 167.23% +evp-aes-256-xts@8192 4.032462227 1.377749511 192.68% +evp-aes-256-xts@16384 4.023293877 1.371558933 193.34% + +A53: + Before optimization After optimization Improve +evp-aes-128-xts@16 18.07842135 13.96980808 29.40% +evp-aes-128-xts@64 7.933818397 6.07159276 30.70% +evp-aes-128-xts@256 5.264604704 2.611155744 101.60% +evp-aes-128-xts@1024 4.606660117 1.722713454 167.40% +evp-aes-128-xts@8192 4.405160115 1.454379201 202.90% +evp-aes-128-xts@16384 4.401592028 1.442279392 205.20% +evp-aes-256-xts@16 20.07084054 16.00803726 25.40% +evp-aes-256-xts@64 9.192647294 6.883876732 33.50% +evp-aes-256-xts@256 6.336143161 3.108140452 103.90% +evp-aes-256-xts@1024 5.62502952 2.097960651 168.10% +evp-aes-256-xts@8192 5.412085608 1.807294191 199.50% +evp-aes-256-xts@16384 5.403062591 1.790135764 201.80% + +N1: + Before optimization After optimization Improve +evp-aes-128-xts@16 6.48147613 4.209415473 53.98% +evp-aes-128-xts@64 2.847744115 1.950757468 45.98% +evp-aes-128-xts@256 2.085711968 1.061903238 96.41% +evp-aes-128-xts@1024 1.842014669 0.798486302 130.69% +evp-aes-128-xts@8192 1.760449052 0.713853939 146.61% +evp-aes-128-xts@16384 1.760763546 0.707702009 148.80% +evp-aes-256-xts@16 7.264142817 5.265970454 37.94% +evp-aes-256-xts@64 3.251356212 2.41176323 34.81% +evp-aes-256-xts@256 2.380488469 1.342095742 77.37% +evp-aes-256-xts@1024 2.08853022 1.041718215 100.49% +evp-aes-256-xts@8192 2.027432668 0.944571334 114.64% +evp-aes-256-xts@16384 2.00740782 0.941991415 113.10% + +Add more XTS test cases to cover the cipher stealing mode and cases of different +number of blocks. + +CustomizedGitHooks: yes +Change-Id: I93ee31b2575e1413764e27b599af62994deb4c96 + +Reviewed-by: Paul Dale +Reviewed-by: Tomas Mraz +(Merged from https://github.com/openssl/openssl/pull/11399) +--- + crypto/aes/asm/aesv8-armx.pl | 1426 +++++++++++++++++ + include/crypto/aes_platform.h | 4 + + .../30-test_evp_data/evpciph_aes_common.txt | 38 + + 3 files changed, 1468 insertions(+) + +Index: openssl-1.1.1d/crypto/aes/asm/aesv8-armx.pl +=================================================================== +--- openssl-1.1.1d.orig/crypto/aes/asm/aesv8-armx.pl ++++ openssl-1.1.1d/crypto/aes/asm/aesv8-armx.pl +@@ -897,6 +897,1432 @@ $code.=<<___; + .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks + ___ + }}} ++# Performance in cycles per byte. ++# Processed with AES-XTS different key size. ++# It shows the value before and after optimization as below: ++# (before/after): ++# ++# AES-128-XTS AES-256-XTS ++# Cortex-A57 3.36/1.09 4.02/1.37 ++# Cortex-A72 3.03/1.02 3.28/1.33 ++ ++# Optimization is implemented by loop unrolling and interleaving. ++# Commonly, we choose the unrolling factor as 5, if the input ++# data size smaller than 5 blocks, but not smaller than 3 blocks, ++# choose 3 as the unrolling factor. ++# If the input data size dsize >= 5*16 bytes, then take 5 blocks ++# as one iteration, every loop the left size lsize -= 5*16. ++# If lsize < 5*16 bytes, treat them as the tail. Note: left 4*16 bytes ++# will be processed specially, which be integrated into the 5*16 bytes ++# loop to improve the efficiency. ++# There is one special case, if the original input data size dsize ++# = 16 bytes, we will treat it seperately to improve the ++# performance: one independent code block without LR, FP load and ++# store. 
++# Encryption will process the (length -tailcnt) bytes as mentioned ++# previously, then encrypt the composite block as last second ++# cipher block. ++# Decryption will process the (length -tailcnt -1) bytes as mentioned ++# previously, then decrypt the last second cipher block to get the ++# last plain block(tail), decrypt the composite block as last second ++# plain text block. ++ ++{{{ ++my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5)); ++my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10"); ++my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20"); ++my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19"); ++my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11"); ++my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); ++my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b"); ++my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]"); ++my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]"); ++ ++my ($tmpin)=("v26.16b"); ++my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); ++ ++# q7 last round key ++# q10-q15, q7 Last 7 round keys ++# q8-q9 preloaded round keys except last 7 keys for big size ++# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte ++ ++ ++my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); ++ ++my ($dat3,$in3,$tmp3); # used only in 64-bit mode ++my ($dat4,$in4,$tmp4); ++if ($flavour =~ /64/) { ++ ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); ++} ++ ++$code.=<<___ if ($flavour =~ /64/); ++.globl ${prefix}_xts_encrypt ++.type ${prefix}_xts_encrypt,%function ++.align 5 ++${prefix}_xts_encrypt: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ cmp $len,#16 ++ // Original input data size bigger than 16, jump to big size processing. ++ b.ne .Lxts_enc_big_size ++ // Encrypt the iv with key2, as the first XEX iv. ++ ldr $rounds,[$key2,#240] ++ vld1.8 {$dat},[$key2],#16 ++ vld1.8 {$iv0},[$ivp] ++ sub $rounds,$rounds,#2 ++ vld1.8 {$dat1},[$key2],#16 ++ ++.Loop_enc_iv_enc: ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2],#16 ++ subs $rounds,$rounds,#2 ++ aese $iv0,$dat1 ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat1},[$key2],#16 ++ b.gt .Loop_enc_iv_enc ++ ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2] ++ aese $iv0,$dat1 ++ veor $iv0,$iv0,$dat ++ ++ vld1.8 {$dat0},[$inp] ++ veor $dat0,$iv0,$dat0 ++ ++ ldr $rounds,[$key1,#240] ++ vld1.32 {q20-q21},[$key1],#32 // load key schedule... ++ ++ aese $dat0,q20 ++ aesmc $dat0,$dat0 ++ vld1.32 {q8-q9},[$key1],#32 // load key schedule... ++ aese $dat0,q21 ++ aesmc $dat0,$dat0 ++ subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-xts processing ++ b.eq .Lxts_128_enc ++.Lxts_enc_round_loop: ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ vld1.32 {q8},[$key1],#16 // load key schedule... ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ vld1.32 {q9},[$key1],#16 // load key schedule... ++ subs $rounds,$rounds,#2 // bias ++ b.gt .Lxts_enc_round_loop ++.Lxts_128_enc: ++ vld1.32 {q10-q11},[$key1],#32 // load key schedule... ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ vld1.32 {q12-q13},[$key1],#32 // load key schedule... ++ aese $dat0,q10 ++ aesmc $dat0,$dat0 ++ aese $dat0,q11 ++ aesmc $dat0,$dat0 ++ vld1.32 {q14-q15},[$key1],#32 // load key schedule... 
++ aese $dat0,q12 ++ aesmc $dat0,$dat0 ++ aese $dat0,q13 ++ aesmc $dat0,$dat0 ++ vld1.32 {$rndlast},[$key1] ++ aese $dat0,q14 ++ aesmc $dat0,$dat0 ++ aese $dat0,q15 ++ veor $dat0,$dat0,$rndlast ++ veor $dat0,$dat0,$iv0 ++ vst1.8 {$dat0},[$out] ++ b .Lxts_enc_final_abort ++ ++.align 4 ++.Lxts_enc_big_size: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ stp $constnumx,$tmpinp,[sp,#-64]! ++ stp $tailcnt,$midnumx,[sp,#48] ++ stp $ivd10,$ivd20,[sp,#32] ++ stp $ivd30,$ivd40,[sp,#16] ++ ++ // tailcnt store the tail value of length%16. ++ and $tailcnt,$len,#0xf ++ and $len,$len,#-16 ++ subs $len,$len,#16 ++ mov $step,#16 ++ b.lo .Lxts_abort ++ csel $step,xzr,$step,eq ++ ++ // Firstly, encrypt the iv with key2, as the first iv of XEX. ++ ldr $rounds,[$key2,#240] ++ vld1.32 {$dat},[$key2],#16 ++ vld1.8 {$iv0},[$ivp] ++ sub $rounds,$rounds,#2 ++ vld1.32 {$dat1},[$key2],#16 ++ ++.Loop_iv_enc: ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2],#16 ++ subs $rounds,$rounds,#2 ++ aese $iv0,$dat1 ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat1},[$key2],#16 ++ b.gt .Loop_iv_enc ++ ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2] ++ aese $iv0,$dat1 ++ veor $iv0,$iv0,$dat ++ ++ // The iv for second block ++ // $ivl- iv(low), $ivh - iv(high) ++ // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4 ++ fmov $ivl,$ivd00 ++ fmov $ivh,$ivd01 ++ mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd10,$ivl ++ fmov $ivd11,$ivh ++ ++ ldr $rounds0,[$key1,#240] // next starting point ++ vld1.8 {$dat},[$inp],$step ++ ++ vld1.32 {q8-q9},[$key1] // load key schedule... ++ sub $rounds0,$rounds0,#6 ++ add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys ++ sub $rounds0,$rounds0,#2 ++ vld1.32 {q10-q11},[$key_],#32 ++ vld1.32 {q12-q13},[$key_],#32 ++ vld1.32 {q14-q15},[$key_],#32 ++ vld1.32 {$rndlast},[$key_] ++ ++ add $key_,$key1,#32 ++ mov $rounds,$rounds0 ++ ++ // Encryption ++.Lxts_enc: ++ vld1.8 {$dat2},[$inp],#16 ++ subs $len,$len,#32 // bias ++ add $rounds,$rounds0,#2 ++ vorr $in1,$dat,$dat ++ vorr $dat1,$dat,$dat ++ vorr $in3,$dat,$dat ++ vorr $in2,$dat2,$dat2 ++ vorr $in4,$dat2,$dat2 ++ b.lo .Lxts_inner_enc_tail ++ veor $dat,$dat,$iv0 // before encryption, xor with iv ++ veor $dat2,$dat2,$iv1 ++ ++ // The iv for third block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd20,$ivl ++ fmov $ivd21,$ivh ++ ++ ++ vorr $dat1,$dat2,$dat2 ++ vld1.8 {$dat2},[$inp],#16 ++ vorr $in0,$dat,$dat ++ vorr $in1,$dat1,$dat1 ++ veor $in2,$dat2,$iv2 // the third block ++ veor $dat2,$dat2,$iv2 ++ cmp $len,#32 ++ b.lo .Lxts_outer_enc_tail ++ ++ // The iv for fourth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd30,$ivl ++ fmov $ivd31,$ivh ++ ++ vld1.8 {$dat3},[$inp],#16 ++ // The iv for fifth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd40,$ivl ++ fmov $ivd41,$ivh ++ ++ vld1.8 {$dat4},[$inp],#16 ++ veor $dat3,$dat3,$iv3 // the fourth block ++ veor $dat4,$dat4,$iv4 ++ sub $len,$len,#32 // bias ++ mov $rounds,$rounds0 ++ b .Loop5x_xts_enc ++ ++.align 4 ++.Loop5x_xts_enc: ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ aese $dat3,q8 ++ aesmc $dat3,$dat3 ++ aese 
$dat4,q8 ++ aesmc $dat4,$dat4 ++ vld1.32 {q8},[$key_],#16 ++ subs $rounds,$rounds,#2 ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ aese $dat3,q9 ++ aesmc $dat3,$dat3 ++ aese $dat4,q9 ++ aesmc $dat4,$dat4 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Loop5x_xts_enc ++ ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ aese $dat3,q8 ++ aesmc $dat3,$dat3 ++ aese $dat4,q8 ++ aesmc $dat4,$dat4 ++ subs $len,$len,#0x50 // because .Lxts_enc_tail4x ++ ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ aese $dat3,q9 ++ aesmc $dat3,$dat3 ++ aese $dat4,q9 ++ aesmc $dat4,$dat4 ++ csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo ++ mov $key_,$key1 ++ ++ aese $dat0,q10 ++ aesmc $dat0,$dat0 ++ aese $dat1,q10 ++ aesmc $dat1,$dat1 ++ aese $dat2,q10 ++ aesmc $dat2,$dat2 ++ aese $dat3,q10 ++ aesmc $dat3,$dat3 ++ aese $dat4,q10 ++ aesmc $dat4,$dat4 ++ add $inp,$inp,$xoffset // x0 is adjusted in such way that ++ // at exit from the loop v1.16b-v26.16b ++ // are loaded with last "words" ++ add $xoffset,$len,#0x60 // because .Lxts_enc_tail4x ++ ++ aese $dat0,q11 ++ aesmc $dat0,$dat0 ++ aese $dat1,q11 ++ aesmc $dat1,$dat1 ++ aese $dat2,q11 ++ aesmc $dat2,$dat2 ++ aese $dat3,q11 ++ aesmc $dat3,$dat3 ++ aese $dat4,q11 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q12 ++ aesmc $dat0,$dat0 ++ aese $dat1,q12 ++ aesmc $dat1,$dat1 ++ aese $dat2,q12 ++ aesmc $dat2,$dat2 ++ aese $dat3,q12 ++ aesmc $dat3,$dat3 ++ aese $dat4,q12 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q13 ++ aesmc $dat0,$dat0 ++ aese $dat1,q13 ++ aesmc $dat1,$dat1 ++ aese $dat2,q13 ++ aesmc $dat2,$dat2 ++ aese $dat3,q13 ++ aesmc $dat3,$dat3 ++ aese $dat4,q13 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q14 ++ aesmc $dat0,$dat0 ++ aese $dat1,q14 ++ aesmc $dat1,$dat1 ++ aese $dat2,q14 ++ aesmc $dat2,$dat2 ++ aese $dat3,q14 ++ aesmc $dat3,$dat3 ++ aese $dat4,q14 ++ aesmc $dat4,$dat4 ++ ++ veor $tmp0,$rndlast,$iv0 ++ aese $dat0,q15 ++ // The iv for first block of one iteration ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ veor $tmp1,$rndlast,$iv1 ++ vld1.8 {$in0},[$inp],#16 ++ aese $dat1,q15 ++ // The iv for second block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd10,$ivl ++ fmov $ivd11,$ivh ++ veor $tmp2,$rndlast,$iv2 ++ vld1.8 {$in1},[$inp],#16 ++ aese $dat2,q15 ++ // The iv for third block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd20,$ivl ++ fmov $ivd21,$ivh ++ veor $tmp3,$rndlast,$iv3 ++ vld1.8 {$in2},[$inp],#16 ++ aese $dat3,q15 ++ // The iv for fourth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd30,$ivl ++ fmov $ivd31,$ivh ++ veor $tmp4,$rndlast,$iv4 ++ vld1.8 {$in3},[$inp],#16 ++ aese $dat4,q15 ++ ++ // The iv for fifth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd40,$ivl ++ fmov $ivd41,$ivh ++ ++ vld1.8 {$in4},[$inp],#16 ++ cbz $xoffset,.Lxts_enc_tail4x ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ veor $tmp0,$tmp0,$dat0 ++ veor $dat0,$in0,$iv0 ++ 
veor $tmp1,$tmp1,$dat1 ++ veor $dat1,$in1,$iv1 ++ veor $tmp2,$tmp2,$dat2 ++ veor $dat2,$in2,$iv2 ++ veor $tmp3,$tmp3,$dat3 ++ veor $dat3,$in3,$iv3 ++ veor $tmp4,$tmp4,$dat4 ++ vst1.8 {$tmp0},[$out],#16 ++ veor $dat4,$in4,$iv4 ++ vst1.8 {$tmp1},[$out],#16 ++ mov $rounds,$rounds0 ++ vst1.8 {$tmp2},[$out],#16 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp3},[$out],#16 ++ vst1.8 {$tmp4},[$out],#16 ++ b.hs .Loop5x_xts_enc ++ ++ ++ // If left 4 blocks, borrow the five block's processing. ++ cmn $len,#0x10 ++ b.ne .Loop5x_enc_after ++ vorr $iv4,$iv3,$iv3 ++ vorr $iv3,$iv2,$iv2 ++ vorr $iv2,$iv1,$iv1 ++ vorr $iv1,$iv0,$iv0 ++ fmov $ivl,$ivd40 ++ fmov $ivh,$ivd41 ++ veor $dat0,$iv0,$in0 ++ veor $dat1,$iv1,$in1 ++ veor $dat2,$in2,$iv2 ++ veor $dat3,$in3,$iv3 ++ veor $dat4,$in4,$iv4 ++ b.eq .Loop5x_xts_enc ++ ++.Loop5x_enc_after: ++ add $len,$len,#0x50 ++ cbz $len,.Lxts_enc_done ++ ++ add $rounds,$rounds0,#2 ++ subs $len,$len,#0x30 ++ b.lo .Lxts_inner_enc_tail ++ ++ veor $dat0,$iv0,$in2 ++ veor $dat1,$iv1,$in3 ++ veor $dat2,$in4,$iv2 ++ b .Lxts_outer_enc_tail ++ ++.align 4 ++.Lxts_enc_tail4x: ++ add $inp,$inp,#16 ++ veor $tmp1,$dat1,$tmp1 ++ vst1.8 {$tmp1},[$out],#16 ++ veor $tmp2,$dat2,$tmp2 ++ vst1.8 {$tmp2},[$out],#16 ++ veor $tmp3,$dat3,$tmp3 ++ veor $tmp4,$dat4,$tmp4 ++ vst1.8 {$tmp3-$tmp4},[$out],#32 ++ ++ b .Lxts_enc_done ++.align 4 ++.Lxts_outer_enc_tail: ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $rounds,$rounds,#2 ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Lxts_outer_enc_tail ++ ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ veor $tmp0,$iv0,$rndlast ++ subs $len,$len,#0x30 ++ // The iv for first block ++ fmov $ivl,$ivd20 ++ fmov $ivh,$ivd21 ++ //mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ veor $tmp1,$iv1,$rndlast ++ csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ veor $tmp2,$iv2,$rndlast ++ ++ add $xoffset,$xoffset,#0x20 ++ add $inp,$inp,$xoffset ++ mov $key_,$key1 ++ ++ aese $dat0,q12 ++ aesmc $dat0,$dat0 ++ aese $dat1,q12 ++ aesmc $dat1,$dat1 ++ aese $dat2,q12 ++ aesmc $dat2,$dat2 ++ aese $dat0,q13 ++ aesmc $dat0,$dat0 ++ aese $dat1,q13 ++ aesmc $dat1,$dat1 ++ aese $dat2,q13 ++ aesmc $dat2,$dat2 ++ aese $dat0,q14 ++ aesmc $dat0,$dat0 ++ aese $dat1,q14 ++ aesmc $dat1,$dat1 ++ aese $dat2,q14 ++ aesmc $dat2,$dat2 ++ aese $dat0,q15 ++ aese $dat1,q15 ++ aese $dat2,q15 ++ vld1.8 {$in2},[$inp],#16 ++ add $rounds,$rounds0,#2 ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ veor $tmp0,$tmp0,$dat0 ++ veor $tmp1,$tmp1,$dat1 ++ veor $dat2,$dat2,$tmp2 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp0},[$out],#16 ++ vst1.8 {$tmp1},[$out],#16 ++ vst1.8 {$dat2},[$out],#16 ++ cmn $len,#0x30 ++ b.eq .Lxts_enc_done ++.Lxts_encxor_one: ++ vorr $in3,$in1,$in1 ++ vorr $in4,$in2,$in2 ++ nop ++ ++.Lxts_inner_enc_tail: ++ cmn $len,#0x10 ++ veor $dat1,$in3,$iv0 ++ veor $dat2,$in4,$iv1 ++ b.eq .Lxts_enc_tail_loop ++ veor $dat2,$in4,$iv0 ++.Lxts_enc_tail_loop: ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 
++ aesmc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $rounds,$rounds,#2 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Lxts_enc_tail_loop ++ ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ aese $dat1,q12 ++ aesmc $dat1,$dat1 ++ aese $dat2,q12 ++ aesmc $dat2,$dat2 ++ cmn $len,#0x20 ++ aese $dat1,q13 ++ aesmc $dat1,$dat1 ++ aese $dat2,q13 ++ aesmc $dat2,$dat2 ++ veor $tmp1,$iv0,$rndlast ++ aese $dat1,q14 ++ aesmc $dat1,$dat1 ++ aese $dat2,q14 ++ aesmc $dat2,$dat2 ++ veor $tmp2,$iv1,$rndlast ++ aese $dat1,q15 ++ aese $dat2,q15 ++ b.eq .Lxts_enc_one ++ veor $tmp1,$tmp1,$dat1 ++ vst1.8 {$tmp1},[$out],#16 ++ veor $tmp2,$tmp2,$dat2 ++ vorr $iv0,$iv1,$iv1 ++ vst1.8 {$tmp2},[$out],#16 ++ fmov $ivl,$ivd10 ++ fmov $ivh,$ivd11 ++ mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ b .Lxts_enc_done ++ ++.Lxts_enc_one: ++ veor $tmp1,$tmp1,$dat2 ++ vorr $iv0,$iv0,$iv0 ++ vst1.8 {$tmp1},[$out],#16 ++ fmov $ivl,$ivd00 ++ fmov $ivh,$ivd01 ++ mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ b .Lxts_enc_done ++.align 5 ++.Lxts_enc_done: ++ // Process the tail block with cipher stealing. ++ tst $tailcnt,#0xf ++ b.eq .Lxts_abort ++ ++ mov $tmpinp,$inp ++ mov $tmpoutp,$out ++ sub $out,$out,#16 ++.composite_enc_loop: ++ subs $tailcnt,$tailcnt,#1 ++ ldrb $l2outp,[$out,$tailcnt] ++ ldrb $loutp,[$tmpinp,$tailcnt] ++ strb $l2outp,[$tmpoutp,$tailcnt] ++ strb $loutp,[$out,$tailcnt] ++ b.gt .composite_enc_loop ++.Lxts_enc_load_done: ++ vld1.8 {$tmpin},[$out] ++ veor $tmpin,$tmpin,$iv0 ++ ++ // Encrypt the composite block to get the last second encrypted text block ++ ldr $rounds,[$key1,#240] // load key schedule... ++ vld1.8 {$dat},[$key1],#16 ++ sub $rounds,$rounds,#2 ++ vld1.8 {$dat1},[$key1],#16 // load key schedule... 
++.Loop_final_enc: ++ aese $tmpin,$dat0 ++ aesmc $tmpin,$tmpin ++ vld1.32 {$dat0},[$key1],#16 ++ subs $rounds,$rounds,#2 ++ aese $tmpin,$dat1 ++ aesmc $tmpin,$tmpin ++ vld1.32 {$dat1},[$key1],#16 ++ b.gt .Loop_final_enc ++ ++ aese $tmpin,$dat0 ++ aesmc $tmpin,$tmpin ++ vld1.32 {$dat0},[$key1] ++ aese $tmpin,$dat1 ++ veor $tmpin,$tmpin,$dat0 ++ veor $tmpin,$tmpin,$iv0 ++ vst1.8 {$tmpin},[$out] ++ ++.Lxts_abort: ++ ldp $tailcnt,$midnumx,[sp,#48] ++ ldp $ivd10,$ivd20,[sp,#32] ++ ldp $ivd30,$ivd40,[sp,#16] ++ ldp $constnumx,$tmpinp,[sp],#64 ++.Lxts_enc_final_abort: ++ ret ++.size ${prefix}_xts_encrypt,.-${prefix}_xts_encrypt ++___ ++ ++}}} ++{{{ ++my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5)); ++my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10"); ++my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20"); ++my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19"); ++my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11"); ++my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); ++my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b"); ++my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]"); ++my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]"); ++ ++my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); ++ ++# q7 last round key ++# q10-q15, q7 Last 7 round keys ++# q8-q9 preloaded round keys except last 7 keys for big size ++# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte ++ ++{ ++my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); ++ ++my ($dat3,$in3,$tmp3); # used only in 64-bit mode ++my ($dat4,$in4,$tmp4); ++if ($flavour =~ /64/) { ++ ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); ++} ++ ++$code.=<<___ if ($flavour =~ /64/); ++.globl ${prefix}_xts_decrypt ++.type ${prefix}_xts_decrypt,%function ++.align 5 ++${prefix}_xts_decrypt: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ cmp $len,#16 ++ // Original input data size bigger than 16, jump to big size processing. ++ b.ne .Lxts_dec_big_size ++ // Encrypt the iv with key2, as the first XEX iv. ++ ldr $rounds,[$key2,#240] ++ vld1.8 {$dat},[$key2],#16 ++ vld1.8 {$iv0},[$ivp] ++ sub $rounds,$rounds,#2 ++ vld1.8 {$dat1},[$key2],#16 ++ ++.Loop_dec_small_iv_enc: ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2],#16 ++ subs $rounds,$rounds,#2 ++ aese $iv0,$dat1 ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat1},[$key2],#16 ++ b.gt .Loop_dec_small_iv_enc ++ ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2] ++ aese $iv0,$dat1 ++ veor $iv0,$iv0,$dat ++ ++ vld1.8 {$dat0},[$inp] ++ veor $dat0,$iv0,$dat0 ++ ++ ldr $rounds,[$key1,#240] ++ vld1.32 {q20-q21},[$key1],#32 // load key schedule... ++ ++ aesd $dat0,q20 ++ aesimc $dat0,$dat0 ++ vld1.32 {q8-q9},[$key1],#32 // load key schedule... ++ aesd $dat0,q21 ++ aesimc $dat0,$dat0 ++ subs $rounds,$rounds,#10 // bias ++ b.eq .Lxts_128_dec ++.Lxts_dec_round_loop: ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ vld1.32 {q8},[$key1],#16 // load key schedule... ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ vld1.32 {q9},[$key1],#16 // load key schedule... ++ subs $rounds,$rounds,#2 // bias ++ b.gt .Lxts_dec_round_loop ++.Lxts_128_dec: ++ vld1.32 {q10-q11},[$key1],#32 // load key schedule... ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ vld1.32 {q12-q13},[$key1],#32 // load key schedule... 
++ aesd $dat0,q10 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q11 ++ aesimc $dat0,$dat0 ++ vld1.32 {q14-q15},[$key1],#32 // load key schedule... ++ aesd $dat0,q12 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q13 ++ aesimc $dat0,$dat0 ++ vld1.32 {$rndlast},[$key1] ++ aesd $dat0,q14 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q15 ++ veor $dat0,$dat0,$rndlast ++ veor $dat0,$iv0,$dat0 ++ vst1.8 {$dat0},[$out] ++ b .Lxts_dec_final_abort ++.Lxts_dec_big_size: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ stp $constnumx,$tmpinp,[sp,#-64]! ++ stp $tailcnt,$midnumx,[sp,#48] ++ stp $ivd10,$ivd20,[sp,#32] ++ stp $ivd30,$ivd40,[sp,#16] ++ ++ and $tailcnt,$len,#0xf ++ and $len,$len,#-16 ++ subs $len,$len,#16 ++ mov $step,#16 ++ b.lo .Lxts_dec_abort ++ ++ // Encrypt the iv with key2, as the first XEX iv ++ ldr $rounds,[$key2,#240] ++ vld1.8 {$dat},[$key2],#16 ++ vld1.8 {$iv0},[$ivp] ++ sub $rounds,$rounds,#2 ++ vld1.8 {$dat1},[$key2],#16 ++ ++.Loop_dec_iv_enc: ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2],#16 ++ subs $rounds,$rounds,#2 ++ aese $iv0,$dat1 ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat1},[$key2],#16 ++ b.gt .Loop_dec_iv_enc ++ ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2] ++ aese $iv0,$dat1 ++ veor $iv0,$iv0,$dat ++ ++ // The iv for second block ++ // $ivl- iv(low), $ivh - iv(high) ++ // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4 ++ fmov $ivl,$ivd00 ++ fmov $ivh,$ivd01 ++ mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd10,$ivl ++ fmov $ivd11,$ivh ++ ++ ldr $rounds0,[$key1,#240] // load rounds number ++ ++ // The iv for third block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd20,$ivl ++ fmov $ivd21,$ivh ++ ++ vld1.32 {q8-q9},[$key1] // load key schedule... ++ sub $rounds0,$rounds0,#6 ++ add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys ++ sub $rounds0,$rounds0,#2 ++ vld1.32 {q10-q11},[$key_],#32 // load key schedule... 
++ vld1.32 {q12-q13},[$key_],#32 ++ vld1.32 {q14-q15},[$key_],#32 ++ vld1.32 {$rndlast},[$key_] ++ ++ // The iv for fourth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd30,$ivl ++ fmov $ivd31,$ivh ++ ++ add $key_,$key1,#32 ++ mov $rounds,$rounds0 ++ b .Lxts_dec ++ ++ // Decryption ++.align 5 ++.Lxts_dec: ++ tst $tailcnt,#0xf ++ b.eq .Lxts_dec_begin ++ subs $len,$len,#16 ++ csel $step,xzr,$step,eq ++ vld1.8 {$dat},[$inp],#16 ++ b.lo .Lxts_done ++ sub $inp,$inp,#16 ++.Lxts_dec_begin: ++ vld1.8 {$dat},[$inp],$step ++ subs $len,$len,#32 // bias ++ add $rounds,$rounds0,#2 ++ vorr $in1,$dat,$dat ++ vorr $dat1,$dat,$dat ++ vorr $in3,$dat,$dat ++ vld1.8 {$dat2},[$inp],#16 ++ vorr $in2,$dat2,$dat2 ++ vorr $in4,$dat2,$dat2 ++ b.lo .Lxts_inner_dec_tail ++ veor $dat,$dat,$iv0 // before decryt, xor with iv ++ veor $dat2,$dat2,$iv1 ++ ++ vorr $dat1,$dat2,$dat2 ++ vld1.8 {$dat2},[$inp],#16 ++ vorr $in0,$dat,$dat ++ vorr $in1,$dat1,$dat1 ++ veor $in2,$dat2,$iv2 // third block xox with third iv ++ veor $dat2,$dat2,$iv2 ++ cmp $len,#32 ++ b.lo .Lxts_outer_dec_tail ++ ++ vld1.8 {$dat3},[$inp],#16 ++ ++ // The iv for fifth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd40,$ivl ++ fmov $ivd41,$ivh ++ ++ vld1.8 {$dat4},[$inp],#16 ++ veor $dat3,$dat3,$iv3 // the fourth block ++ veor $dat4,$dat4,$iv4 ++ sub $len,$len,#32 // bias ++ mov $rounds,$rounds0 ++ b .Loop5x_xts_dec ++ ++.align 4 ++.Loop5x_xts_dec: ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q8 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q8 ++ aesimc $dat4,$dat4 ++ vld1.32 {q8},[$key_],#16 // load key schedule... ++ subs $rounds,$rounds,#2 ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q9 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q9 ++ aesimc $dat4,$dat4 ++ vld1.32 {q9},[$key_],#16 // load key schedule... 
++ b.gt .Loop5x_xts_dec ++ ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q8 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q8 ++ aesimc $dat4,$dat4 ++ subs $len,$len,#0x50 // because .Lxts_dec_tail4x ++ ++ aesd $dat0,q9 ++ aesimc $dat0,$dat ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q9 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q9 ++ aesimc $dat4,$dat4 ++ csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo ++ mov $key_,$key1 ++ ++ aesd $dat0,q10 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q10 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q10 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q10 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q10 ++ aesimc $dat4,$dat4 ++ add $inp,$inp,$xoffset // x0 is adjusted in such way that ++ // at exit from the loop v1.16b-v26.16b ++ // are loaded with last "words" ++ add $xoffset,$len,#0x60 // because .Lxts_dec_tail4x ++ ++ aesd $dat0,q11 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q11 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q11 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q11 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q11 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q12 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q12 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q12 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q12 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q12 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q13 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q13 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q13 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q13 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q13 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q14 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q14 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q14 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q14 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q14 ++ aesimc $dat4,$dat4 ++ ++ veor $tmp0,$rndlast,$iv0 ++ aesd $dat0,q15 ++ // The iv for first block of next iteration. 
++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ veor $tmp1,$rndlast,$iv1 ++ vld1.8 {$in0},[$inp],#16 ++ aesd $dat1,q15 ++ // The iv for second block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd10,$ivl ++ fmov $ivd11,$ivh ++ veor $tmp2,$rndlast,$iv2 ++ vld1.8 {$in1},[$inp],#16 ++ aesd $dat2,q15 ++ // The iv for third block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd20,$ivl ++ fmov $ivd21,$ivh ++ veor $tmp3,$rndlast,$iv3 ++ vld1.8 {$in2},[$inp],#16 ++ aesd $dat3,q15 ++ // The iv for fourth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd30,$ivl ++ fmov $ivd31,$ivh ++ veor $tmp4,$rndlast,$iv4 ++ vld1.8 {$in3},[$inp],#16 ++ aesd $dat4,q15 ++ ++ // The iv for fifth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd40,$ivl ++ fmov $ivd41,$ivh ++ ++ vld1.8 {$in4},[$inp],#16 ++ cbz $xoffset,.Lxts_dec_tail4x ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ veor $tmp0,$tmp0,$dat0 ++ veor $dat0,$in0,$iv0 ++ veor $tmp1,$tmp1,$dat1 ++ veor $dat1,$in1,$iv1 ++ veor $tmp2,$tmp2,$dat2 ++ veor $dat2,$in2,$iv2 ++ veor $tmp3,$tmp3,$dat3 ++ veor $dat3,$in3,$iv3 ++ veor $tmp4,$tmp4,$dat4 ++ vst1.8 {$tmp0},[$out],#16 ++ veor $dat4,$in4,$iv4 ++ vst1.8 {$tmp1},[$out],#16 ++ mov $rounds,$rounds0 ++ vst1.8 {$tmp2},[$out],#16 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp3},[$out],#16 ++ vst1.8 {$tmp4},[$out],#16 ++ b.hs .Loop5x_xts_dec ++ ++ cmn $len,#0x10 ++ b.ne .Loop5x_dec_after ++ // If x2($len) equal to -0x10, the left blocks is 4. ++ // After specially processing, utilize the five blocks processing again. ++ // It will use the following IVs: $iv0,$iv0,$iv1,$iv2,$iv3. 
++ vorr $iv4,$iv3,$iv3 ++ vorr $iv3,$iv2,$iv2 ++ vorr $iv2,$iv1,$iv1 ++ vorr $iv1,$iv0,$iv0 ++ fmov $ivl,$ivd40 ++ fmov $ivh,$ivd41 ++ veor $dat0,$iv0,$in0 ++ veor $dat1,$iv1,$in1 ++ veor $dat2,$in2,$iv2 ++ veor $dat3,$in3,$iv3 ++ veor $dat4,$in4,$iv4 ++ b.eq .Loop5x_xts_dec ++ ++.Loop5x_dec_after: ++ add $len,$len,#0x50 ++ cbz $len,.Lxts_done ++ ++ add $rounds,$rounds0,#2 ++ subs $len,$len,#0x30 ++ b.lo .Lxts_inner_dec_tail ++ ++ veor $dat0,$iv0,$in2 ++ veor $dat1,$iv1,$in3 ++ veor $dat2,$in4,$iv2 ++ b .Lxts_outer_dec_tail ++ ++.align 4 ++.Lxts_dec_tail4x: ++ add $inp,$inp,#16 ++ vld1.32 {$dat0},[$inp],#16 ++ veor $tmp1,$dat1,$tmp0 ++ vst1.8 {$tmp1},[$out],#16 ++ veor $tmp2,$dat2,$tmp2 ++ vst1.8 {$tmp2},[$out],#16 ++ veor $tmp3,$dat3,$tmp3 ++ veor $tmp4,$dat4,$tmp4 ++ vst1.8 {$tmp3-$tmp4},[$out],#32 ++ ++ b .Lxts_done ++.align 4 ++.Lxts_outer_dec_tail: ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $rounds,$rounds,#2 ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Lxts_outer_dec_tail ++ ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ veor $tmp0,$iv0,$rndlast ++ subs $len,$len,#0x30 ++ // The iv for first block ++ fmov $ivl,$ivd20 ++ fmov $ivh,$ivd21 ++ mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ veor $tmp1,$iv1,$rndlast ++ csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ veor $tmp2,$iv2,$rndlast ++ // The iv for second block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd10,$ivl ++ fmov $ivd11,$ivh ++ ++ add $xoffset,$xoffset,#0x20 ++ add $inp,$inp,$xoffset // $inp is adjusted to the last data ++ ++ mov $key_,$key1 ++ ++ // The iv for third block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd20,$ivl ++ fmov $ivd21,$ivh ++ ++ aesd $dat0,q12 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q12 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q12 ++ aesimc $dat2,$dat2 ++ aesd $dat0,q13 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q13 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q13 ++ aesimc $dat2,$dat2 ++ aesd $dat0,q14 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q14 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q14 ++ aesimc $dat2,$dat2 ++ vld1.8 {$in2},[$inp],#16 ++ aesd $dat0,q15 ++ aesd $dat1,q15 ++ aesd $dat2,q15 ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ add $rounds,$rounds0,#2 ++ veor $tmp0,$tmp0,$dat0 ++ veor $tmp1,$tmp1,$dat1 ++ veor $dat2,$dat2,$tmp2 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp0},[$out],#16 ++ vst1.8 {$tmp1},[$out],#16 ++ vst1.8 {$dat2},[$out],#16 ++ ++ cmn $len,#0x30 ++ add $len,$len,#0x30 ++ b.eq .Lxts_done ++ sub $len,$len,#0x30 ++ vorr $in3,$in1,$in1 ++ vorr $in4,$in2,$in2 ++ nop ++ ++.Lxts_inner_dec_tail: ++ // $len == -0x10 means two blocks left. 
++ cmn $len,#0x10 ++ veor $dat1,$in3,$iv0 ++ veor $dat2,$in4,$iv1 ++ b.eq .Lxts_dec_tail_loop ++ veor $dat2,$in4,$iv0 ++.Lxts_dec_tail_loop: ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $rounds,$rounds,#2 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Lxts_dec_tail_loop ++ ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ aesd $dat1,q12 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q12 ++ aesimc $dat2,$dat2 ++ cmn $len,#0x20 ++ aesd $dat1,q13 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q13 ++ aesimc $dat2,$dat2 ++ veor $tmp1,$iv0,$rndlast ++ aesd $dat1,q14 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q14 ++ aesimc $dat2,$dat2 ++ veor $tmp2,$iv1,$rndlast ++ aesd $dat1,q15 ++ aesd $dat2,q15 ++ b.eq .Lxts_dec_one ++ veor $tmp1,$tmp1,$dat1 ++ veor $tmp2,$tmp2,$dat2 ++ vorr $iv0,$iv2,$iv2 ++ vorr $iv1,$iv3,$iv3 ++ vst1.8 {$tmp1},[$out],#16 ++ vst1.8 {$tmp2},[$out],#16 ++ add $len,$len,#16 ++ b .Lxts_done ++ ++.Lxts_dec_one: ++ veor $tmp1,$tmp1,$dat2 ++ vorr $iv0,$iv1,$iv1 ++ vorr $iv1,$iv2,$iv2 ++ vst1.8 {$tmp1},[$out],#16 ++ add $len,$len,#32 ++ ++.Lxts_done: ++ tst $tailcnt,#0xf ++ b.eq .Lxts_dec_abort ++ // Processing the last two blocks with cipher stealing. ++ mov x7,x3 ++ cbnz x2,.Lxts_dec_1st_done ++ vld1.32 {$dat0},[$inp],#16 ++ ++ // Decrypt the last secod block to get the last plain text block ++.Lxts_dec_1st_done: ++ eor $tmpin,$dat0,$iv1 ++ ldr $rounds,[$key1,#240] ++ vld1.32 {$dat0},[$key1],#16 ++ sub $rounds,$rounds,#2 ++ vld1.32 {$dat1},[$key1],#16 ++.Loop_final_2nd_dec: ++ aesd $tmpin,$dat0 ++ aesimc $tmpin,$tmpin ++ vld1.32 {$dat0},[$key1],#16 // load key schedule... ++ subs $rounds,$rounds,#2 ++ aesd $tmpin,$dat1 ++ aesimc $tmpin,$tmpin ++ vld1.32 {$dat1},[$key1],#16 // load key schedule... ++ b.gt .Loop_final_2nd_dec ++ ++ aesd $tmpin,$dat0 ++ aesimc $tmpin,$tmpin ++ vld1.32 {$dat0},[$key1] ++ aesd $tmpin,$dat1 ++ veor $tmpin,$tmpin,$dat0 ++ veor $tmpin,$tmpin,$iv1 ++ vst1.8 {$tmpin},[$out] ++ ++ mov $tmpinp,$inp ++ add $tmpoutp,$out,#16 ++ ++ // Composite the tailcnt "16 byte not aligned block" into the last second plain blocks ++ // to get the last encrypted block. ++.composite_dec_loop: ++ subs $tailcnt,$tailcnt,#1 ++ ldrb $l2outp,[$out,$tailcnt] ++ ldrb $loutp,[$tmpinp,$tailcnt] ++ strb $l2outp,[$tmpoutp,$tailcnt] ++ strb $loutp,[$out,$tailcnt] ++ b.gt .composite_dec_loop ++.Lxts_dec_load_done: ++ vld1.8 {$tmpin},[$out] ++ veor $tmpin,$tmpin,$iv0 ++ ++ // Decrypt the composite block to get the last second plain text block ++ ldr $rounds,[$key_,#240] ++ vld1.8 {$dat},[$key_],#16 ++ sub $rounds,$rounds,#2 ++ vld1.8 {$dat1},[$key_],#16 ++.Loop_final_dec: ++ aesd $tmpin,$dat0 ++ aesimc $tmpin,$tmpin ++ vld1.32 {$dat0},[$key_],#16 // load key schedule... ++ subs $rounds,$rounds,#2 ++ aesd $tmpin,$dat1 ++ aesimc $tmpin,$tmpin ++ vld1.32 {$dat1},[$key_],#16 // load key schedule... 
++ b.gt .Loop_final_dec ++ ++ aesd $tmpin,$dat0 ++ aesimc $tmpin,$tmpin ++ vld1.32 {$dat0},[$key_] ++ aesd $tmpin,$dat1 ++ veor $tmpin,$tmpin,$dat0 ++ veor $tmpin,$tmpin,$iv0 ++ vst1.8 {$tmpin},[$out] ++ ++.Lxts_dec_abort: ++ ldp $tailcnt,$midnumx,[sp,#48] ++ ldp $ivd10,$ivd20,[sp,#32] ++ ldp $ivd30,$ivd40,[sp,#16] ++ ldp $constnumx,$tmpinp,[sp],#64 ++ ++.Lxts_dec_final_abort: ++ ret ++.size ${prefix}_xts_decrypt,.-${prefix}_xts_decrypt ++___ ++} ++}}} + $code.=<<___; + #endif + ___ +Index: openssl-1.1.1d/crypto/evp/e_aes.c +=================================================================== +--- openssl-1.1.1d.orig/crypto/evp/e_aes.c ++++ openssl-1.1.1d/crypto/evp/e_aes.c +@@ -170,6 +170,10 @@ static void ctr64_inc(unsigned char *cou + # define HWAES_set_decrypt_key aes_p8_set_decrypt_key + # define HWAES_encrypt aes_p8_encrypt + # define HWAES_decrypt aes_p8_decrypt ++# if __ARM_MAX_ARCH__>=8 ++# define HWAES_xts_encrypt aes_v8_xts_encrypt ++# define HWAES_xts_decrypt aes_v8_xts_decrypt ++# endif + # define HWAES_cbc_encrypt aes_p8_cbc_encrypt + # define HWAES_ctr32_encrypt_blocks aes_p8_ctr32_encrypt_blocks + # define HWAES_xts_encrypt aes_p8_xts_encrypt +Index: openssl-1.1.1d/test/recipes/30-test_evp_data/evpcase.txt +=================================================================== +--- openssl-1.1.1d.orig/test/recipes/30-test_evp_data/evpcase.txt ++++ openssl-1.1.1d/test/recipes/30-test_evp_data/evpcase.txt +@@ -15,6 +15,44 @@ + # These tests exercise the case insensitive handling of object names. + # They are contrived + ++Title = AES XTS Non standard test vectors - generated from reference implementation ++ ++Cipher = aes-128-xts ++Key = fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0 ++IV = 9a785634120000000000000000000000 ++Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f2021 ++Ciphertext = edbf9dace45d6f6a7306e64be5dd824b9dc31efeb418c373ce073b66755529982538 ++ ++Cipher = aes-128-xts ++Key = fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0 ++IV = 9a785634120000000000000000000000 ++Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f3031 ++Ciphertext = edbf9dace45d6f6a7306e64be5dd824b2538f5724fcf24249ac111ab45ad39237a709959673bd8747d58690f8c762a353ad6 ++ ++Cipher = aes-128-xts ++Key = 2718281828459045235360287471352631415926535897932384626433832795 ++IV = 00000000000000000000000000000000 ++Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f ++Ciphertext = 27a7479befa1d476489f308cd4cfa6e2a96e4bbe3208ff25287dd3819616e89cc78cf7f5e543445f8333d8fa7f56000005279fa5d8b5e4ad40e736ddb4d35412 ++ ++Cipher = aes-128-xts ++Key = fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0 ++IV = 9a785634120000000000000000000000 ++Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f40 ++Ciphertext = edbf9dace45d6f6a7306e64be5dd824b2538f5724fcf24249ac111ab45ad39233ad6183c66fa548a3cdf3e36d2b21ccde9ffb48286ec211619e02decc7ca0883c6 ++ ++Cipher = aes-128-xts ++Key = fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0 ++IV = 9a785634120000000000000000000000 ++Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f ++Ciphertext = 
edbf9dace45d6f6a7306e64be5dd824b2538f5724fcf24249ac111ab45ad39233ad6183c66fa548a3cdf3e36d2b21ccdc6bc657cb3aeb87ba2c5f58ffafacd76d0a098b687c0b6536d560ca007051b0b ++ ++Cipher = aes-128-xts ++Key = fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0 ++IV = 9a785634120000000000000000000000 ++Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f5051 ++Ciphertext = edbf9dace45d6f6a7306e64be5dd824b2538f5724fcf24249ac111ab45ad39233ad6183c66fa548a3cdf3e36d2b21ccdc6bc657cb3aeb87ba2c5f58ffafacd765ecc4c85c0a01bf317b823fbd6111956d0a0 ++ + Title = Case insensitive AES tests + + Cipher = Aes-128-eCb diff --git a/rpm/openssl-1_1-Optimize-RSA-armv8.patch b/rpm/openssl-1_1-Optimize-RSA-armv8.patch new file mode 100644 index 0000000..4aeeba0 --- /dev/null +++ b/rpm/openssl-1_1-Optimize-RSA-armv8.patch @@ -0,0 +1,575 @@ +From 5ea64b456b1a27ae046f23d632a968a7583bb9eb Mon Sep 17 00:00:00 2001 +From: "Fangming.Fang" +Date: Tue, 28 Apr 2020 02:33:50 +0000 +Subject: [PATCH] Read MIDR_EL1 system register on aarch64 + +MIDR_EL1 system register exposes microarchitecture information so that +people can make micro-arch related optimization such as exposing as +much instruction level parallelism as possible. + +MIDR_EL1 register can be read only if HWCAP_CPUID feature is supported. + +Change-Id: Iabb8a36c5d31b184dba6399f378598058d394d4e + +Reviewed-by: Paul Dale +Reviewed-by: Tomas Mraz +(Merged from https://github.com/openssl/openssl/pull/11744) +--- + crypto/arm64cpuid.pl | 7 +++++++ + crypto/arm_arch.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ + crypto/armcap.c | 11 +++++++++++ + 3 files changed, 62 insertions(+) + +Index: openssl-1.1.1d/crypto/arm64cpuid.pl +=================================================================== +--- openssl-1.1.1d.orig/crypto/arm64cpuid.pl ++++ openssl-1.1.1d/crypto/arm64cpuid.pl +@@ -78,6 +78,13 @@ _armv8_sha512_probe: + ret + .size _armv8_sha512_probe,.-_armv8_sha512_probe + ++.globl _armv8_cpuid_probe ++.type _armv8_cpuid_probe,%function ++_armv8_cpuid_probe: ++ mrs x0, midr_el1 ++ ret ++.size _armv8_cpuid_probe,.-_armv8_cpuid_probe ++ + .globl OPENSSL_cleanse + .type OPENSSL_cleanse,%function + .align 5 +Index: openssl-1.1.1d/crypto/arm_arch.h +=================================================================== +--- openssl-1.1.1d.orig/crypto/arm_arch.h ++++ openssl-1.1.1d/crypto/arm_arch.h +@@ -71,6 +71,7 @@ + + # ifndef __ASSEMBLER__ + extern unsigned int OPENSSL_armcap_P; ++extern unsigned int OPENSSL_arm_midr; + # endif + + # define ARMV7_NEON (1<<0) +@@ -80,5 +81,48 @@ extern unsigned int OPENSSL_armcap_P; + # define ARMV8_SHA256 (1<<4) + # define ARMV8_PMULL (1<<5) + # define ARMV8_SHA512 (1<<6) ++# define ARMV8_CPUID (1<<7) + ++/* ++ * MIDR_EL1 system register ++ * ++ * 63___ _ ___32_31___ _ ___24_23_____20_19_____16_15__ _ __4_3_______0 ++ * | | | | | | | ++ * |RES0 | Implementer | Variant | Arch | PartNum |Revision| ++ * |____ _ _____|_____ _ _____|_________|_______ _|____ _ ___|________| ++ * ++ */ ++ ++# define ARM_CPU_IMP_ARM 0x41 ++ ++# define ARM_CPU_PART_CORTEX_A72 0xD08 ++# define ARM_CPU_PART_N1 0xD0C ++ ++# define MIDR_PARTNUM_SHIFT 4 ++# define MIDR_PARTNUM_MASK (0xfff << MIDR_PARTNUM_SHIFT) ++# define MIDR_PARTNUM(midr) \ ++ (((midr) & MIDR_PARTNUM_MASK) >> MIDR_PARTNUM_SHIFT) ++ ++# define MIDR_IMPLEMENTER_SHIFT 24 ++# define MIDR_IMPLEMENTER_MASK (0xff << MIDR_IMPLEMENTER_SHIFT) ++# define MIDR_IMPLEMENTER(midr) \ ++ 
(((midr) & MIDR_IMPLEMENTER_MASK) >> MIDR_IMPLEMENTER_SHIFT) ++ ++# define MIDR_ARCHITECTURE_SHIFT 16 ++# define MIDR_ARCHITECTURE_MASK (0xf << MIDR_ARCHITECTURE_SHIFT) ++# define MIDR_ARCHITECTURE(midr) \ ++ (((midr) & MIDR_ARCHITECTURE_MASK) >> MIDR_ARCHITECTURE_SHIFT) ++ ++# define MIDR_CPU_MODEL_MASK \ ++ (MIDR_IMPLEMENTER_MASK | \ ++ MIDR_PARTNUM_MASK | \ ++ MIDR_ARCHITECTURE_MASK) ++ ++# define MIDR_CPU_MODEL(imp, partnum) \ ++ (((imp) << MIDR_IMPLEMENTER_SHIFT) | \ ++ (0xf << MIDR_ARCHITECTURE_SHIFT) | \ ++ ((partnum) << MIDR_PARTNUM_SHIFT)) ++ ++# define MIDR_IS_CPU_MODEL(midr, imp, partnum) \ ++ (((midr) & MIDR_CPU_MODEL_MASK) == MIDR_CPU_MODEL(imp, partnum)) + #endif +Index: openssl-1.1.1d/crypto/armcap.c +=================================================================== +--- openssl-1.1.1d.orig/crypto/armcap.c ++++ openssl-1.1.1d/crypto/armcap.c +@@ -18,6 +18,8 @@ + #include "arm_arch.h" + + unsigned int OPENSSL_armcap_P = 0; ++unsigned int OPENSSL_arm_midr = 0; ++unsigned int OPENSSL_armv8_rsa_neonized = 0; + + #if __ARM_MAX_ARCH__<7 + void OPENSSL_cpuid_setup(void) +@@ -48,6 +50,7 @@ void _armv8_sha256_probe(void); + void _armv8_pmull_probe(void); + # ifdef __aarch64__ + void _armv8_sha512_probe(void); ++unsigned int _armv8_cpuid_probe(void); + # endif + uint32_t _armv7_tick(void); + +@@ -95,6 +98,7 @@ void OPENSSL_cpuid_setup(void) __attribu + # define HWCAP_CE_PMULL (1 << 4) + # define HWCAP_CE_SHA1 (1 << 5) + # define HWCAP_CE_SHA256 (1 << 6) ++# define HWCAP_CPUID (1 << 11) + # define HWCAP_CE_SHA512 (1 << 21) + # endif + +@@ -155,6 +159,9 @@ void OPENSSL_cpuid_setup(void) + # ifdef __aarch64__ + if (hwcap & HWCAP_CE_SHA512) + OPENSSL_armcap_P |= ARMV8_SHA512; ++ ++ if (hwcap & HWCAP_CPUID) ++ OPENSSL_armcap_P |= ARMV8_CPUID; + # endif + } + # endif +@@ -210,5 +217,16 @@ void OPENSSL_cpuid_setup(void) + + sigaction(SIGILL, &ill_oact, NULL); + sigprocmask(SIG_SETMASK, &oset, NULL); ++ ++# ifdef __aarch64__ ++ if (OPENSSL_armcap_P & ARMV8_CPUID) ++ OPENSSL_arm_midr = _armv8_cpuid_probe(); ++ ++ if ((MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72) || ++ MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1)) && ++ (OPENSSL_armcap_P & ARMV7_NEON)) { ++ OPENSSL_armv8_rsa_neonized = 1; ++ } ++# endif + } + #endif +Index: openssl-1.1.1d/crypto/bn/asm/armv8-mont.pl +=================================================================== +--- openssl-1.1.1d.orig/crypto/bn/asm/armv8-mont.pl ++++ openssl-1.1.1d/crypto/bn/asm/armv8-mont.pl +@@ -64,16 +64,34 @@ $n0="x4"; # const BN_ULONG *n0, + $num="x5"; # int num); + + $code.=<<___; ++#ifndef __KERNEL__ ++# include "arm_arch.h" ++.extern OPENSSL_armv8_rsa_neonized ++.hidden OPENSSL_armv8_rsa_neonized ++#endif + .text + + .globl bn_mul_mont + .type bn_mul_mont,%function + .align 5 + bn_mul_mont: ++.Lbn_mul_mont: ++ tst $num,#3 ++ b.ne .Lmul_mont ++ cmp $num,#32 ++ b.le .Lscalar_impl ++#ifndef __KERNEL__ ++ adrp x17,OPENSSL_armv8_rsa_neonized ++ ldr w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized] ++ cbnz w17, bn_mul8x_mont_neon ++#endif ++ ++.Lscalar_impl: + tst $num,#7 + b.eq __bn_sqr8x_mont + tst $num,#3 + b.eq __bn_mul4x_mont ++ + .Lmul_mont: + stp x29,x30,[sp,#-64]! 
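[Editor's aside, not part of the patch] The net effect of the arm_arch.h and armcap.c hunks above is: when the kernel advertises HWCAP_CPUID, MIDR_EL1 is read via _armv8_cpuid_probe(), and the NEONized Montgomery path is enabled only on Cortex-A72 (part 0xD08) and Neoverse N1 (part 0xD0C). A small self-contained C illustration of the same field decoding; the sample MIDR value and variable names here are the editor's, not the patch's:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t midr = 0x410FD0C0;            /* sample: implementer 0x41 (Arm), part 0xD0C (Neoverse N1) */
        uint32_t imp  = (midr >> 24) & 0xff;   /* same shift/mask as MIDR_IMPLEMENTER() */
        uint32_t part = (midr >> 4) & 0xfff;   /* same shift/mask as MIDR_PARTNUM() */
        int neonized = (imp == 0x41) && (part == 0xD08 || part == 0xD0C);
        printf("implementer 0x%02x part 0x%03x -> rsa_neonized=%d\n", imp, part, neonized);
        return 0;
    }

In the real code the probe is only attempted after ARMV8_CPUID has been set from HWCAP_CPUID, since the MIDR_EL1 read is only guaranteed to work when the kernel reports that capability.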
+ add x29,sp,#0 +@@ -271,6 +289,369 @@ bn_mul_mont: + .size bn_mul_mont,.-bn_mul_mont + ___ + { ++my ($A0,$A1,$N0,$N1)=map("v$_",(0..3)); ++my ($Z,$Temp)=("v4.16b","v5"); ++my @ACC=map("v$_",(6..13)); ++my ($Bi,$Ni,$M0)=map("v$_",(28..30)); ++my $sBi="s28"; ++my $sM0="s30"; ++my $zero="v14"; ++my $temp="v15"; ++my $ACCTemp="v16"; ++ ++my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5)); ++my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11)); ++ ++$code.=<<___; ++.type bn_mul8x_mont_neon,%function ++.align 5 ++bn_mul8x_mont_neon: ++ stp x29,x30,[sp,#-80]! ++ mov x16,sp ++ stp d8,d9,[sp,#16] ++ stp d10,d11,[sp,#32] ++ stp d12,d13,[sp,#48] ++ stp d14,d15,[sp,#64] ++ lsl $num,$num,#1 ++ eor $zero.16b,$zero.16b,$zero.16b ++ ++.align 4 ++.LNEON_8n: ++ eor @ACC[0].16b,@ACC[0].16b,@ACC[0].16b ++ sub $toutptr,sp,#128 ++ eor @ACC[1].16b,@ACC[1].16b,@ACC[1].16b ++ sub $toutptr,$toutptr,$num,lsl#4 ++ eor @ACC[2].16b,@ACC[2].16b,@ACC[2].16b ++ and $toutptr,$toutptr,#-64 ++ eor @ACC[3].16b,@ACC[3].16b,@ACC[3].16b ++ mov sp,$toutptr // alloca ++ eor @ACC[4].16b,@ACC[4].16b,@ACC[4].16b ++ add $toutptr,$toutptr,#256 ++ eor @ACC[5].16b,@ACC[5].16b,@ACC[5].16b ++ sub $inner,$num,#8 ++ eor @ACC[6].16b,@ACC[6].16b,@ACC[6].16b ++ eor @ACC[7].16b,@ACC[7].16b,@ACC[7].16b ++ ++.LNEON_8n_init: ++ st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32 ++ subs $inner,$inner,#8 ++ st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32 ++ st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32 ++ st1 {@ACC[6].2d,@ACC[7].2d},[$toutptr],#32 ++ bne .LNEON_8n_init ++ ++ add $tinptr,sp,#256 ++ ld1 {$A0.4s,$A1.4s},[$aptr],#32 ++ add $bnptr,sp,#8 ++ ldr $sM0,[$n0],#4 ++ mov $outer,$num ++ b .LNEON_8n_outer ++ ++.align 4 ++.LNEON_8n_outer: ++ ldr $sBi,[$bptr],#4 // *b++ ++ uxtl $Bi.4s,$Bi.4h ++ add $toutptr,sp,#128 ++ ld1 {$N0.4s,$N1.4s},[$nptr],#32 ++ ++ umlal @ACC[0].2d,$Bi.2s,$A0.s[0] ++ umlal @ACC[1].2d,$Bi.2s,$A0.s[1] ++ umlal @ACC[2].2d,$Bi.2s,$A0.s[2] ++ shl $Ni.2d,@ACC[0].2d,#16 ++ ext $Ni.16b,$Ni.16b,$Ni.16b,#8 ++ umlal @ACC[3].2d,$Bi.2s,$A0.s[3] ++ add $Ni.2d,$Ni.2d,@ACC[0].2d ++ umlal @ACC[4].2d,$Bi.2s,$A1.s[0] ++ mul $Ni.2s,$Ni.2s,$M0.2s ++ umlal @ACC[5].2d,$Bi.2s,$A1.s[1] ++ st1 {$Bi.2s},[sp] // put aside smashed b[8*i+0] ++ umlal @ACC[6].2d,$Bi.2s,$A1.s[2] ++ uxtl $Ni.4s,$Ni.4h ++ umlal @ACC[7].2d,$Bi.2s,$A1.s[3] ++___ ++for ($i=0; $i<7;) { ++$code.=<<___; ++ ldr $sBi,[$bptr],#4 // *b++ ++ umlal @ACC[0].2d,$Ni.2s,$N0.s[0] ++ umlal @ACC[1].2d,$Ni.2s,$N0.s[1] ++ uxtl $Bi.4s,$Bi.4h ++ umlal @ACC[2].2d,$Ni.2s,$N0.s[2] ++ ushr $temp.2d,@ACC[0].2d,#16 ++ umlal @ACC[3].2d,$Ni.2s,$N0.s[3] ++ umlal @ACC[4].2d,$Ni.2s,$N1.s[0] ++ ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8 ++ add @ACC[0].2d,@ACC[0].2d,$temp.2d ++ umlal @ACC[5].2d,$Ni.2s,$N1.s[1] ++ ushr @ACC[0].2d,@ACC[0].2d,#16 ++ umlal @ACC[6].2d,$Ni.2s,$N1.s[2] ++ umlal @ACC[7].2d,$Ni.2s,$N1.s[3] ++ add $ACCTemp.2d,@ACC[1].2d,@ACC[0].2d ++ ins @ACC[1].d[0],$ACCTemp.d[0] ++ st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i] ++___ ++ push(@ACC,shift(@ACC)); $i++; ++$code.=<<___; ++ umlal @ACC[0].2d,$Bi.2s,$A0.s[0] ++ ld1 {@ACC[7].2d},[$tinptr],#16 ++ umlal @ACC[1].2d,$Bi.2s,$A0.s[1] ++ umlal @ACC[2].2d,$Bi.2s,$A0.s[2] ++ shl $Ni.2d,@ACC[0].2d,#16 ++ ext $Ni.16b,$Ni.16b,$Ni.16b,#8 ++ umlal @ACC[3].2d,$Bi.2s,$A0.s[3] ++ add $Ni.2d,$Ni.2d,@ACC[0].2d ++ umlal @ACC[4].2d,$Bi.2s,$A1.s[0] ++ mul $Ni.2s,$Ni.2s,$M0.2s ++ umlal @ACC[5].2d,$Bi.2s,$A1.s[1] ++ st1 {$Bi.2s},[$bnptr],#8 // put aside smashed b[8*i+$i] ++ umlal @ACC[6].2d,$Bi.2s,$A1.s[2] ++ uxtl $Ni.4s,$Ni.4h ++ umlal 
@ACC[7].2d,$Bi.2s,$A1.s[3] ++___ ++} ++$code.=<<___; ++ ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0] ++ umlal @ACC[0].2d,$Ni.2s,$N0.s[0] ++ ld1 {$A0.4s,$A1.4s},[$aptr],#32 ++ umlal @ACC[1].2d,$Ni.2s,$N0.s[1] ++ umlal @ACC[2].2d,$Ni.2s,$N0.s[2] ++ mov $Temp.16b,@ACC[0].16b ++ ushr $Temp.2d,$Temp.2d,#16 ++ ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8 ++ umlal @ACC[3].2d,$Ni.2s,$N0.s[3] ++ umlal @ACC[4].2d,$Ni.2s,$N1.s[0] ++ add @ACC[0].2d,@ACC[0].2d,$Temp.2d ++ umlal @ACC[5].2d,$Ni.2s,$N1.s[1] ++ ushr @ACC[0].2d,@ACC[0].2d,#16 ++ eor $temp.16b,$temp.16b,$temp.16b ++ ins @ACC[0].d[1],$temp.d[0] ++ umlal @ACC[6].2d,$Ni.2s,$N1.s[2] ++ umlal @ACC[7].2d,$Ni.2s,$N1.s[3] ++ add @ACC[1].2d,@ACC[1].2d,@ACC[0].2d ++ st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i] ++ add $bnptr,sp,#8 // rewind ++___ ++ push(@ACC,shift(@ACC)); ++$code.=<<___; ++ sub $inner,$num,#8 ++ b .LNEON_8n_inner ++ ++.align 4 ++.LNEON_8n_inner: ++ subs $inner,$inner,#8 ++ umlal @ACC[0].2d,$Bi.2s,$A0.s[0] ++ ld1 {@ACC[7].2d},[$tinptr] ++ umlal @ACC[1].2d,$Bi.2s,$A0.s[1] ++ ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+0] ++ umlal @ACC[2].2d,$Bi.2s,$A0.s[2] ++ ld1 {$N0.4s,$N1.4s},[$nptr],#32 ++ umlal @ACC[3].2d,$Bi.2s,$A0.s[3] ++ b.eq .LInner_jump ++ add $tinptr,$tinptr,#16 // don't advance in last iteration ++.LInner_jump: ++ umlal @ACC[4].2d,$Bi.2s,$A1.s[0] ++ umlal @ACC[5].2d,$Bi.2s,$A1.s[1] ++ umlal @ACC[6].2d,$Bi.2s,$A1.s[2] ++ umlal @ACC[7].2d,$Bi.2s,$A1.s[3] ++___ ++for ($i=1; $i<8; $i++) { ++$code.=<<___; ++ ld1 {$Bi.2s},[$bnptr],#8 // pull smashed b[8*i+$i] ++ umlal @ACC[0].2d,$Ni.2s,$N0.s[0] ++ umlal @ACC[1].2d,$Ni.2s,$N0.s[1] ++ umlal @ACC[2].2d,$Ni.2s,$N0.s[2] ++ umlal @ACC[3].2d,$Ni.2s,$N0.s[3] ++ umlal @ACC[4].2d,$Ni.2s,$N1.s[0] ++ umlal @ACC[5].2d,$Ni.2s,$N1.s[1] ++ umlal @ACC[6].2d,$Ni.2s,$N1.s[2] ++ umlal @ACC[7].2d,$Ni.2s,$N1.s[3] ++ st1 {@ACC[0].2d},[$toutptr],#16 ++___ ++ push(@ACC,shift(@ACC)); ++$code.=<<___; ++ umlal @ACC[0].2d,$Bi.2s,$A0.s[0] ++ ld1 {@ACC[7].2d},[$tinptr] ++ umlal @ACC[1].2d,$Bi.2s,$A0.s[1] ++ ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+$i] ++ umlal @ACC[2].2d,$Bi.2s,$A0.s[2] ++ b.eq .LInner_jump$i ++ add $tinptr,$tinptr,#16 // don't advance in last iteration ++.LInner_jump$i: ++ umlal @ACC[3].2d,$Bi.2s,$A0.s[3] ++ umlal @ACC[4].2d,$Bi.2s,$A1.s[0] ++ umlal @ACC[5].2d,$Bi.2s,$A1.s[1] ++ umlal @ACC[6].2d,$Bi.2s,$A1.s[2] ++ umlal @ACC[7].2d,$Bi.2s,$A1.s[3] ++___ ++} ++$code.=<<___; ++ b.ne .LInner_after_rewind$i ++ sub $aptr,$aptr,$num,lsl#2 // rewind ++.LInner_after_rewind$i: ++ umlal @ACC[0].2d,$Ni.2s,$N0.s[0] ++ ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0] ++ umlal @ACC[1].2d,$Ni.2s,$N0.s[1] ++ ld1 {$A0.4s,$A1.4s},[$aptr],#32 ++ umlal @ACC[2].2d,$Ni.2s,$N0.s[2] ++ add $bnptr,sp,#8 // rewind ++ umlal @ACC[3].2d,$Ni.2s,$N0.s[3] ++ umlal @ACC[4].2d,$Ni.2s,$N1.s[0] ++ umlal @ACC[5].2d,$Ni.2s,$N1.s[1] ++ umlal @ACC[6].2d,$Ni.2s,$N1.s[2] ++ st1 {@ACC[0].2d},[$toutptr],#16 ++ umlal @ACC[7].2d,$Ni.2s,$N1.s[3] ++ ++ bne .LNEON_8n_inner ++___ ++ push(@ACC,shift(@ACC)); ++$code.=<<___; ++ add $tinptr,sp,#128 ++ st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32 ++ eor $N0.16b,$N0.16b,$N0.16b // $N0 ++ st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32 ++ eor $N1.16b,$N1.16b,$N1.16b // $N1 ++ st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32 ++ st1 {@ACC[6].2d},[$toutptr] ++ ++ subs $outer,$outer,#8 ++ ld1 {@ACC[0].2d,@ACC[1].2d},[$tinptr],#32 ++ ld1 {@ACC[2].2d,@ACC[3].2d},[$tinptr],#32 ++ ld1 {@ACC[4].2d,@ACC[5].2d},[$tinptr],#32 ++ ld1 {@ACC[6].2d,@ACC[7].2d},[$tinptr],#32 ++ ++ b.eq .LInner_8n_jump_2steps ++ sub 
$nptr,$nptr,$num,lsl#2 // rewind ++ b .LNEON_8n_outer ++ ++.LInner_8n_jump_2steps: ++ add $toutptr,sp,#128 ++ st1 {$N0.2d,$N1.2d}, [sp],#32 // start wiping stack frame ++ mov $Temp.16b,@ACC[0].16b ++ ushr $temp.2d,@ACC[0].2d,#16 ++ ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8 ++ st1 {$N0.2d,$N1.2d}, [sp],#32 ++ add @ACC[0].2d,@ACC[0].2d,$temp.2d ++ st1 {$N0.2d,$N1.2d}, [sp],#32 ++ ushr $temp.2d,@ACC[0].2d,#16 ++ st1 {$N0.2d,$N1.2d}, [sp],#32 ++ zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h ++ ins $temp.d[1],$zero.d[0] ++ ++ mov $inner,$num ++ b .LNEON_tail_entry ++ ++.align 4 ++.LNEON_tail: ++ add @ACC[0].2d,@ACC[0].2d,$temp.2d ++ mov $Temp.16b,@ACC[0].16b ++ ushr $temp.2d,@ACC[0].2d,#16 ++ ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8 ++ ld1 {@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32 ++ add @ACC[0].2d,@ACC[0].2d,$temp.2d ++ ld1 {@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32 ++ ushr $temp.2d,@ACC[0].2d,#16 ++ ld1 {@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32 ++ zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h ++ ins $temp.d[1],$zero.d[0] ++ ++.LNEON_tail_entry: ++___ ++for ($i=1; $i<8; $i++) { ++$code.=<<___; ++ add @ACC[1].2d,@ACC[1].2d,$temp.2d ++ st1 {@ACC[0].s}[0], [$toutptr],#4 ++ ushr $temp.2d,@ACC[1].2d,#16 ++ mov $Temp.16b,@ACC[1].16b ++ ext @ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8 ++ add @ACC[1].2d,@ACC[1].2d,$temp.2d ++ ushr $temp.2d,@ACC[1].2d,#16 ++ zip1 @ACC[1].4h,$Temp.4h,@ACC[1].4h ++ ins $temp.d[1],$zero.d[0] ++___ ++ push(@ACC,shift(@ACC)); ++} ++ push(@ACC,shift(@ACC)); ++$code.=<<___; ++ ld1 {@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32 ++ subs $inner,$inner,#8 ++ st1 {@ACC[7].s}[0], [$toutptr],#4 ++ bne .LNEON_tail ++ ++ st1 {$temp.s}[0], [$toutptr],#4 // top-most bit ++ sub $nptr,$nptr,$num,lsl#2 // rewind $nptr ++ subs $aptr,sp,#0 // clear carry flag ++ add $bptr,sp,$num,lsl#2 ++ ++.LNEON_sub: ++ ldp w4,w5,[$aptr],#8 ++ ldp w6,w7,[$aptr],#8 ++ ldp w8,w9,[$nptr],#8 ++ ldp w10,w11,[$nptr],#8 ++ sbcs w8,w4,w8 ++ sbcs w9,w5,w9 ++ sbcs w10,w6,w10 ++ sbcs w11,w7,w11 ++ sub x17,$bptr,$aptr ++ stp w8,w9,[$rptr],#8 ++ stp w10,w11,[$rptr],#8 ++ cbnz x17,.LNEON_sub ++ ++ ldr w10, [$aptr] // load top-most bit ++ mov x11,sp ++ eor v0.16b,v0.16b,v0.16b ++ sub x11,$bptr,x11 // this is num*4 ++ eor v1.16b,v1.16b,v1.16b ++ mov $aptr,sp ++ sub $rptr,$rptr,x11 // rewind $rptr ++ mov $nptr,$bptr // second 3/4th of frame ++ sbcs w10,w10,wzr // result is carry flag ++ ++.LNEON_copy_n_zap: ++ ldp w4,w5,[$aptr],#8 ++ ldp w6,w7,[$aptr],#8 ++ ldp w8,w9,[$rptr],#8 ++ ldp w10,w11,[$rptr] ++ sub $rptr,$rptr,#8 ++ b.cs .LCopy_1 ++ mov w8,w4 ++ mov w9,w5 ++ mov w10,w6 ++ mov w11,w7 ++.LCopy_1: ++ st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe ++ st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe ++ ldp w4,w5,[$aptr],#8 ++ ldp w6,w7,[$aptr],#8 ++ stp w8,w9,[$rptr],#8 ++ stp w10,w11,[$rptr],#8 ++ sub $aptr,$aptr,#32 ++ ldp w8,w9,[$rptr],#8 ++ ldp w10,w11,[$rptr] ++ sub $rptr,$rptr,#8 ++ b.cs .LCopy_2 ++ mov w8, w4 ++ mov w9, w5 ++ mov w10, w6 ++ mov w11, w7 ++.LCopy_2: ++ st1 {v0.2d,v1.2d}, [$aptr],#32 // wipe ++ st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe ++ sub x17,$bptr,$aptr // preserves carry ++ stp w8,w9,[$rptr],#8 ++ stp w10,w11,[$rptr],#8 ++ cbnz x17,.LNEON_copy_n_zap ++ ++ mov sp,x16 ++ ldp d14,d15,[sp,#64] ++ ldp d12,d13,[sp,#48] ++ ldp d10,d11,[sp,#32] ++ ldp d8,d9,[sp,#16] ++ ldr x29,[sp],#80 ++ ret // bx lr ++ ++.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon ++___ ++} ++{ + ######################################################################## + # Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module. 
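[Editor's aside, not part of the patch] .LNEON_sub and .LNEON_copy_n_zap above perform the usual Montgomery finalisation: subtract the modulus once, keep whichever of the two candidate results the final borrow selects, and wipe the scratch area on the stack. A simplified C sketch of that step (32-bit words, editor's names, and ignoring the extra top word the assembly also folds into the borrow):

    #include <stdint.h>
    #include <string.h>

    void mont_final_reduce(uint32_t *r, uint32_t *t, const uint32_t *n, int num)
    {
        uint64_t borrow = 0;
        for (int i = 0; i < num; i++) {
            uint64_t d = (uint64_t)t[i] - n[i] - borrow;
            r[i] = (uint32_t)d;
            borrow = (d >> 32) & 1;                   /* 1 if this word had to borrow */
        }
        if (borrow)                                    /* t < n: t itself is already reduced */
            memcpy(r, t, (size_t)num * sizeof(uint32_t));
        memset(t, 0, (size_t)num * sizeof(uint32_t));  /* wipe the temporary, as the asm does */
    }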
+ +Index: openssl-1.1.1d/crypto/bn/build.info +=================================================================== +--- openssl-1.1.1d.orig/crypto/bn/build.info ++++ openssl-1.1.1d/crypto/bn/build.info +@@ -65,3 +65,4 @@ INCLUDE[armv4-mont.o]=.. + GENERATE[armv4-gf2m.S]=asm/armv4-gf2m.pl $(PERLASM_SCHEME) + INCLUDE[armv4-gf2m.o]=.. + GENERATE[armv8-mont.S]=asm/armv8-mont.pl $(PERLASM_SCHEME) ++INCLUDE[armv8-mont.o]=.. diff --git a/rpm/openssl.spec b/rpm/openssl.spec index b185770..2bb8473 100644 --- a/rpm/openssl.spec +++ b/rpm/openssl.spec @@ -26,7 +26,7 @@ %define thread_test_threads %{?threads:%{threads}}%{!?threads:1} Summary: Utilities from the general purpose cryptography library with TLS implementation Name: openssl -Version: 1.1.1l +Version: 1.1.1s # Do not forget to bump SHLIB_VERSION on version upgrades Release: 1 @@ -78,8 +78,11 @@ Patch70: openssl-1.1.1-rewire-fips-drbg.patch # Backported fixes including security fixes Patch52: openssl-1.1.1-s390x-update.patch Patch53: openssl-1.1.1-fips-crng-test.patch -Patch55: openssl-1.1.1-arm-update.patch +Patch55: openssl-1.1.1-aes-asm-aesv8-armx.pl-20-improvement-on-ThunderX2.patch Patch56: openssl-1.1.1-s390x-ecc.patch +Patch57: openssl-1_1-Optimize-AES-GCM-uarchs.patch +Patch58: openssl-1_1-Optimize-AES-XTS-aarch64.patch +Patch59: openssl-1_1-Optimize-RSA-armv8.patch License: OpenSSL URL: http://www.openssl.org/