[SVE] Remove all instances of sizeless structures #310

shibatch · 2020-05-22T04:38:30Z

This patch removes all instances of sizeless structures, as stated in issue #309.

It still does not build with gcc-10 with SVE because of the ICE. Please help me coming up with a workaround.

PierreBlanchard · 2020-05-22T14:55:40Z

Hello Naoki! Thanks a lot for working on that patch.
I got an ICE as well (see below), can you confirm it is the same one you get? I'm trying to extract a small reproducer and I will investigate where it could be coming from.

during RTL pass: pro_and_epilogue
/home/piebla01/math-libs/sleef/src/libm/sleefsimdsp.c: In function ‘_ZGVsNxv_sinf_u35’:
/home/piebla01/math-libs/sleef/src/libm/sleefsimdsp.c:510:1: internal compiler error: Segmentation fault
  510 | }
      | ^
0xb35063 crash_signal
	/space/piebla01/gcc-10.1.0/gcc/toplev.c:328
0xdd0598 aarch64_expand_epilogue(bool)
	/space/piebla01/gcc-10.1.0/gcc/config/aarch64/aarch64.c:8188
0x1082ecb gen_epilogue()
	/space/piebla01/gcc-10.1.0/gcc/config/aarch64/aarch64.md:829
0xdc892b target_gen_epilogue
	/space/piebla01/gcc-10.1.0/gcc/config/aarch64/aarch64.md:7245
0x8985db make_epilogue_seq
	/space/piebla01/gcc-10.1.0/gcc/function.c:5823
0x898b5f thread_prologue_and_epilogue_insns()
	/space/piebla01/gcc-10.1.0/gcc/function.c:5926
0x898c2f rest_of_handle_thread_prologue_and_epilogue
	/space/piebla01/gcc-10.1.0/gcc/function.c:6395
0x898c2f execute
	/space/piebla01/gcc-10.1.0/gcc/function.c:6471

shibatch · 2020-05-23T03:55:06Z

Hello Pierre,

I have already reported the ICE to Francesco, but the following is the shortened source code for reproducing the ICE.

// gcc-10 -O2 -march=armv8-a+sve bug-sve-gcc10.c -S

#include <arm_sve.h>

static svint32_t vrint_vi2_vf(svfloat32_t d) {
  return svcvt_s32_f32_x(svptrue_b8(), svrintn_f32_x(svptrue_b8(), d));
}
static svfloat32_t vf2gety_vf_vf2(svfloat32x2_t v) { return svget2_f32(v, 1); }
static svfloat32x2_t vf2setx_vf2_vf2_vf(svfloat32x2_t v, svfloat32_t d) { return svset2_f32(v, 0, d); }
static svfloat32x2_t dfmul_vf2_vf_vf(svfloat32_t x, svfloat32_t y) {
  return svcreate2_f32(svmul_f32_x(svptrue_b8(), x, y), svmul_f32_x(svptrue_b8(), x, y));
}
static svfloat32x2_t dfadd2_vf2_vf2_vf2(svfloat32x2_t x, svfloat32x2_t y) {
  return svcreate2_f32(svadd_f32_x(svptrue_b8(), svget2_f32(x, 0), svget2_f32(y, 0)),
                       svadd_f32_x(svptrue_b8(), svget2_f32(x, 1), vf2gety_vf_vf2(y)));
}
static svint32_t vilogb2k_vi2_vf(svfloat32_t d) {
  svint32_t q = svreinterpret_s32_f32(d);
  q = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b8(), svreinterpret_u32_s32(q), 23));
  return svsub_s32_x(svptrue_b8(), q, svdup_n_s32(0x7f));
}
static svfloat32x2_t rempisubf(svfloat32_t x) {
  svfloat32_t y = svrintn_f32_x(svptrue_b32(), svmul_f32_x(svptrue_b8(), x, svdup_n_f32(4)));
  svint32_t vi = svcvt_s32_f32_x(svptrue_b8(), svsub_f32_x(svptrue_b8(), y, svmul_f32_x(svptrue_b8(), svrintn_f32_x(svptrue_b32(), x), svdup_n_f32(4))));
  return svcreate2_f32(svsub_f32_x(svptrue_b8(), x, svmul_f32_x(svptrue_b8(), y, svdup_n_f32(0.25))), svreinterpret_f32_s32(vi));
}
static svfloat32x3_t rempif(svfloat32_t a) {
  extern const float rempitabsp[];
  svfloat32x2_t x, y;
  svint32_t ex = vilogb2k_vi2_vf(a), q = ex;
  x = dfmul_vf2_vf_vf(a, svld1_gather_s32index_f32(svptrue_b8(), rempitabsp, ex));
  svfloat32x2_t di = rempisubf(svget2_f32(x, 0));
  q = svreinterpret_s32_f32(svget2_f32(di, 1));
  x = dfadd2_vf2_vf2_vf2(x, y);
  di = rempisubf(svget2_f32(x, 0));
  x = vf2setx_vf2_vf2_vf(x, svget2_f32(di, 0));
  y = svcreate2_f32(svld1_gather_s32index_f32(svptrue_b8(), rempitabsp+2, ex), svld1_gather_s32index_f32(svptrue_b8(), rempitabsp+3, ex));
  x = dfadd2_vf2_vf2_vf2(x, y);
  return svcreate3_f32(svget2_f32(x, 0), svget2_f32(x, 1), svreinterpret_f32_s32(q));
}
svfloat32_t xsinf(svfloat32_t d) {
  svfloat32_t u, r = d;
  if (svcntp_b32(svptrue_b32(), svcmplt_f32(svptrue_b8(), svabs_f32_x(svptrue_b8(), d), svdup_n_f32(0))) == svcntw()) {
    u = svcvt_f32_s32_x(svptrue_b8(), vrint_vi2_vf(d));
    d = svadd_f32_x(svptrue_b8(), svmul_f32_x(svptrue_b8(), u, u), d);
  } else {
    svfloat32x3_t dfi = rempif(d);
    d = svadd_f32_x(svptrue_b8(), svget3_f32(dfi, 0), svget3_f32(dfi, 1));
    d = svreinterpret_f32_s32(svsel_s32(svcmpeq_n_f32(svptrue_b8(),r, __builtin_inff()), svdup_n_s32(0xffffffff), svreinterpret_s32_f32(d)));
  }
  return svadd_f32_x(svptrue_b8(), svmul_f32_x(svptrue_b8(), svdup_n_f32(0.2f), d), svdup_n_f32(0.1f));
}

rsandifo-arm · 2020-05-28T18:13:54Z

Thanks for reporting the GCC ICE. I've just pushed a fix to the gcc-10 branch . The patch also applies cleanly to the GCC 10.1 sources, if you're building from tarballs rather than git.

A workaround for broken compilers is to compile with -fno-shrink-wrap, although that will lose some optimisation.

* Update CI settings

shibatch · 2020-05-29T02:51:19Z

Hello Richard,

Thank you for fixing the bug. I still cannot build the library with gcc-10 because of "Error: unknown pseudo-op: `.variant_pcs'." I have to try with newer binutils.

shibatch · 2020-05-30T12:05:46Z

I believe the following is another bug in gcc-10.

[neon]~/work/sleef3/bug/sve-gcc10-2$ cat bug2.c
#include <stdio.h>
#include <stdint.h>
#include <arm_sve.h>

int main(void) {
  uint64_t a[64], u[svcntd()];
  for(int i=0;i<sizeof(a);i++) ((unsigned char *)&a)[i] = i;
  svst1_s32(svptrue_b8(), (void *)&u, svld1_s32(svptrue_b8(), (int32_t *)&a));
  for(int i=0;i<svcntd();i++) printf("%016lx:", (unsigned long)(u[i]));
  printf("\n");
}
[neon]~/work/sleef3/bug/sve-gcc10-2$ armclang -O3 -march=armv8-a+sve bug2.c
[neon]~/work/sleef3/bug/sve-gcc10-2$ armie -msve-vector-bits=128 ./a.out
0706050403020100:0f0e0d0c0b0a0908:
[neon]~/work/sleef3/bug/sve-gcc10-2$ gcc-10 -fno-shrink-wrap -O1 -march=armv8-a+sve bug2.c
[neon]~/work/sleef3/bug/sve-gcc10-2$ armie -msve-vector-bits=128 ./a.out
0706050403020100:0f0e0d0c0b0a0908:
[neon]~/work/sleef3/bug/sve-gcc10-2$ gcc-10 -fno-shrink-wrap -O2 -march=armv8-a+sve bug2.c
[neon]~/work/sleef3/bug/sve-gcc10-2$ armie -msve-vector-bits=128 ./a.out
0000007f97201ec8:0000000000000000:
[neon]~/work/sleef3/bug/sve-gcc10-2$

shibatch · 2020-05-30T12:10:55Z

And this one?

[neon]~/work/sleef3/bug/sve-gcc10-2$ cat bug3.c
#include <stdio.h>
#include <arm_sve.h>

int main(void) {
  for(int i=0;i<svcntd();i++) printf("x");
  printf("\n");
}
[neon]~/work/sleef3/bug/sve-gcc10-2$ armie --version
20.0-14
[neon]~/work/sleef3/bug/sve-gcc10-2$ armclang -O3 -march=armv8-a+sve bug3.c
[neon]~/work/sleef3/bug/sve-gcc10-2$ armie -msve-vector-bits=256 ./a.out
xxxx
[neon]~/work/sleef3/bug/sve-gcc10-2$ gcc-10 -fno-shrink-wrap -O1 -march=armv8-a+sve bug3.c
[neon]~/work/sleef3/bug/sve-gcc10-2$ armie -msve-vector-bits=256 ./a.out
xxxx
[neon]~/work/sleef3/bug/sve-gcc10-2$ gcc-10 -fno-shrink-wrap -O2 -march=armv8-a+sve bug3.c
[neon]~/work/sleef3/bug/sve-gcc10-2$ armie -msve-vector-bits=128 ./a.out
xx
[neon]~/work/sleef3/bug/sve-gcc10-2$ armie -msve-vector-bits=256 ./a.out
xxxxxxxxxxxxxxxxxxxxxxxxxx (gets into an infinite loop)

shibatch · 2020-05-30T15:12:36Z

The optimization flags causing those bugs are -fstrict-aliasing for bug2.c, and -ftree-vrp for bug3.c.

shibatch · 2020-05-31T10:58:27Z

So, I succeeded in building the library with "-fno-shrink-wrap -fno-strict-aliasing -fno-tree-vrp" options. I added a new CI setting with gcc-10 to .travis.yml. In this setting, binutils-2.34 and qemu 5.0.0 built by me are used. These are downloaded from my web server and installed under /opt/local.

rsandifo-arm · 2020-06-01T11:32:51Z

Thanks for raising this.

I'll try to fix bug3.c for GCC 10.2 and report back. Thanks for tracking down which pass caused it.

For bug2.c: this kind of behaviour is actually expected. The reason is that, at the language level, the SVE load and store intrinsics act very much like the individual scalar accesses would. For example, svld1_s32 loads data as int32_t elements rather than as typeless data or individual bytes. This means that the aliasing rules for svld1_s32 and svst1_s32 are the same as they are for individual int32_t accesses. The code therefore invokes undefined behaviour because it accesses uint64_t objects as though they were int32_t objects.

If an array contains uint64_t objects, then according to the default aliasing rules, they need to be loaded and stored using svld1_u64 and svst1_u64. The vectors can be converted to and from the processing type using things like svreinterpret_s32_u64.

Like you say, -fno-strict-aliasing is supported as a way of opting out of the alias rules, if you'd prefer to keep things as they are. This is likely to reduce optimisation though.

shibatch · 2020-06-01T15:27:48Z

Thank you for your explanation. But I still don't understand why it is undefined.
Is it because the alignments of those two data types are different? I believe casting from uint64_t * to int32_t * is permitted. Is it because conversion between those two data types are not defined? I believe conversion is straightforward.

[neon]~/work/sleef3/bug/sve-gcc10-2$ cat bug2y.c
#include <stdio.h>
#include <arm_sve.h>

int main(void) {
  uint32_t u[svcntw()];
  svst1_u64(svptrue_b8(), (void *)u, svdup_n_u64(1));
  for(int i=0;i<svcntw();i++) printf("%08x", (unsigned int)(u[i]));
  printf("\n");
  //
  uint64_t v[svcntd()];
  svst1_u32(svptrue_b8(), (void *)v, svdup_n_u32(1));
  for(int i=0;i<svcntd();i++) printf("%016lx", (unsigned long int)(v[i]));
  printf("\n");
}
[neon]~/work/sleef3/bug/sve-gcc10-2$ gcc-10 -Wall -Wcast-align -O1 -march=armv8-a+sve bug2y.c
[neon]~/work/sleef3/bug/sve-gcc10-2$ armie -msve-vector-bits=128 ./a.out
00000001000000000000000100000000
00000001000000010000000100000001
[neon]~/work/sleef3/bug/sve-gcc10-2$ gcc-10 -Wall -Wcast-align -O2 -march=armv8-a+sve bug2y.c
[neon]~/work/sleef3/bug/sve-gcc10-2$ armie -msve-vector-bits=128 ./a.out
d6e3d6800000007fa74b51e00000007f
00000000000000040000007fa3a26000
[neon]~/work/sleef3/bug/sve-gcc10-2$

rsandifo-arm · 2020-06-01T16:08:25Z

Hi. The problem isn't so much with the cast itself, but that the intrinsics implicitly dereference their pointer arguments. C and C++ only allow a pointer ptr of type T * to be dereferenced if the “real” runtime type of the object at ptr is compatible with T up to a certain point. The exact rules are quite complicated, but https://stackoverflow.com/a/51228315 has a good write-up.

So for example in:

uint64_t x;
for (int i = 0; i < sizeof(uint64_t); ++i) ((unsigned char *)&x)[i] = 1; // OK
*(uint32_t *)&x = 2; // UB

the loop setting up the uint64_t is OK (and the loop in bug2.c is OK) because accessing a uint64_t using character types is explicitly allowed. But accessing the uint64_t object as though it had uint32_t is undefined behaviour. The intrinsics inherit this behaviour because (at the language level) they act like individual scalar accesses.

shibatch · 2020-06-02T04:02:31Z

I modified the code in SLEEF so that it complies with the aliasing rules.
I need additional modification to the DFT, and I will do it in another PR.

PierreBlanchard · 2020-06-24T12:50:09Z

Dear Naoki,
Thanks to your patch I was able to compile the SVE routines successfully with a gcc-10.1.0 (+latest binutils) built locally (from a tarball). All tests pass except the fft ones but I guess that is the DFT related issue you mentioned in your previous message, that you are looking to fix. I did not have issue running them either.
I had a look at the code earlier this month and it all looked good to me.
Thanks a lot for this effort as well as @rsandifo-arm for your help on this!

fpetrogalli

Approving on behalf of @PierreBlanchard

Thank you Naoki for working on this, and for raising the issues with gcc.

Thank you @rsandifo-arm for fixing the issues, and thank you @PierreBlanchard for verifying the patch.

Good team work! :)

Francesco

rsandifo-arm · 2020-07-23T07:39:37Z

[neon]~/work/sleef3/bug/sve-gcc10-2$ cat bug3.c
#include <stdio.h>
#include <arm_sve.h>

int main(void) {
  for(int i=0;i<svcntd();i++) printf("x");
  printf("\n");
}
[neon]~/work/sleef3/bug/sve-gcc10-2$ armie --version
20.0-14
[neon]~/work/sleef3/bug/sve-gcc10-2$ armclang -O3 -march=armv8-a+sve bug3.c
[neon]~/work/sleef3/bug/sve-gcc10-2$ armie -msve-vector-bits=256 ./a.out
xxxx
[neon]~/work/sleef3/bug/sve-gcc10-2$ gcc-10 -fno-shrink-wrap -O1 -march=armv8-a+sve bug3.c
[neon]~/work/sleef3/bug/sve-gcc10-2$ armie -msve-vector-bits=256 ./a.out
xxxx
[neon]~/work/sleef3/bug/sve-gcc10-2$ gcc-10 -fno-shrink-wrap -O2 -march=armv8-a+sve bug3.c
[neon]~/work/sleef3/bug/sve-gcc10-2$ armie -msve-vector-bits=128 ./a.out
xx
[neon]~/work/sleef3/bug/sve-gcc10-2$ armie -msve-vector-bits=256 ./a.out
xxxxxxxxxxxxxxxxxxxxxxxxxx (gets into an infinite loop)

Thanks for reporting this. It was fixed in GCC 10.2, which was released earlier today.

shibatch added 2 commits May 22, 2020 10:36

no message

a79d90f

no message

d0088c3

shibatch requested a review from fpetrogalli May 22, 2020 04:38

shibatch added 2 commits May 29, 2020 11:38

* Pass -fno-shrink-wrap flags to aarch64 gcc-10.1.

b9ac9a8

* Update CI settings

no message

01d87a6

shibatch added 3 commits May 29, 2020 11:53

no message

115cb94

no message

9e44916

no message

cf9ba27

shibatch added 6 commits May 31, 2020 00:34

no message

d88dc1e

no message

a29190d

no message

9524450

no message

082c603

no message

ac811ec

no message

ddcc6d1

Making the libm part conformant to the strict-aliasing rules

a2ef3df

shibatch added 2 commits June 2, 2020 13:42

no message

368f0f2

no message

d0aa893

fpetrogalli approved these changes Jun 24, 2020

View reviewed changes

fpetrogalli merged commit cc5f3cd into master Jun 24, 2020

shibatch deleted the Remove_all_instances_of_sizeless_structures branch August 28, 2020 11:54

ggardet mentioned this pull request Oct 7, 2020

Build broken on aarch64 with GCC10 because of broken SVE code in third_party/sleef pytorch/pytorch#45971

Closed

blapie mentioned this pull request Nov 22, 2023

[SVE] Remove all instances of sizeless structures #309

Closed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[SVE] Remove all instances of sizeless structures #310

[SVE] Remove all instances of sizeless structures #310

shibatch commented May 22, 2020

PierreBlanchard commented May 22, 2020

shibatch commented May 23, 2020 •

edited

Loading

rsandifo-arm commented May 28, 2020

shibatch commented May 29, 2020

shibatch commented May 30, 2020

shibatch commented May 30, 2020

shibatch commented May 30, 2020

shibatch commented May 31, 2020 •

edited

Loading

rsandifo-arm commented Jun 1, 2020

shibatch commented Jun 1, 2020

rsandifo-arm commented Jun 1, 2020

shibatch commented Jun 2, 2020

PierreBlanchard commented Jun 24, 2020 •

edited

Loading

fpetrogalli left a comment

rsandifo-arm commented Jul 23, 2020

[SVE] Remove all instances of sizeless structures #310

[SVE] Remove all instances of sizeless structures #310

Conversation

shibatch commented May 22, 2020

PierreBlanchard commented May 22, 2020

shibatch commented May 23, 2020 • edited Loading

rsandifo-arm commented May 28, 2020

shibatch commented May 29, 2020

shibatch commented May 30, 2020

shibatch commented May 30, 2020

shibatch commented May 30, 2020

shibatch commented May 31, 2020 • edited Loading

rsandifo-arm commented Jun 1, 2020

shibatch commented Jun 1, 2020

rsandifo-arm commented Jun 1, 2020

shibatch commented Jun 2, 2020

PierreBlanchard commented Jun 24, 2020 • edited Loading

fpetrogalli left a comment

Choose a reason for hiding this comment

rsandifo-arm commented Jul 23, 2020

shibatch commented May 23, 2020 •

edited

Loading

shibatch commented May 31, 2020 •

edited

Loading

PierreBlanchard commented Jun 24, 2020 •

edited

Loading