Remove unnecessary __at_align32__ in int_elementwise_binary_256 (#45470)

Summary: They were added in 4b3046e based on a misunderstanding of `_mm256_storeu_si256`, but they are actually unnecessary. The [documentation][1] of `_mm256_storeu_si256` says:

> Moves values from a integer vector to an **unaligned** memory location.

In this case, it's better to remove the `__at_align32__` qualifier to leave the compiler and linker more flexibility to optimize.

[1]: https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html

Close #44810

Pull Request resolved: #45470

Reviewed By: zhangguanheng66

Differential Revision: D23980060

Pulled By: glaringlee

fbshipit-source-id: 12b3558b76c6e81d88a72081060fdb8674464768
pytorch · Sep 29, 2020 · 0df99ad · 0df99ad
1 parent 6e55a26
commit 0df99ad
Showing 1 changed file with 10 additions and 2 deletions.
diff --git a/aten/src/ATen/cpu/vec256/vec256_int.h b/aten/src/ATen/cpu/vec256/vec256_int.h
@@ -104,6 +104,8 @@ class Vec256<int64_t> : public Vec256i {
   }
   void store(void* ptr, int count = size()) const {
     if (count == size()) {
+      // ptr need not be aligned here. See
+      // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html
       _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
     } else if (count > 0) {
       __at_align32__ int64_t tmp_values[size()];
@@ -228,6 +230,8 @@ class Vec256<int32_t> : public Vec256i {
   }
   void store(void* ptr, int count = size()) const {
     if (count == size()) {
+      // ptr need not be aligned here. See
+      // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html
       _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
     } else if (count > 0) {
       __at_align32__ int32_t tmp_values[size()];
@@ -449,6 +453,8 @@ class Vec256<int16_t> : public Vec256i {
   }
   void store(void* ptr, int count = size()) const {
     if (count == size()) {
+      // ptr need not be aligned here. See
+      // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html
       _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
     } else if (count > 0) {
       __at_align32__ int16_t tmp_values[size()];
@@ -699,6 +705,8 @@ class Vec256<int8_t> : public Vec256i {
   }
   void store(void* ptr, int count = size()) const {
     if (count == size()) {
+      // ptr need not be aligned here. See
+      // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html
       _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
     } else if (count > 0) {
       __at_align32__ int8_t tmp_values[size()];
@@ -879,8 +887,8 @@ Vec256<int16_t> inline operator*(const Vec256<int16_t>& a, const Vec256<int16_t>
 
 template <typename T, typename Op>
 Vec256<T> inline int_elementwise_binary_256(const Vec256<T>& a, const Vec256<T>& b, Op op) {
-  __at_align32__ T values_a[Vec256<T>::size()];
-  __at_align32__ T values_b[Vec256<T>::size()];
+  T values_a[Vec256<T>::size()];
+  T values_b[Vec256<T>::size()];
   a.store(values_a);
   b.store(values_b);
   for (int i = 0; i != Vec256<T>::size(); i++) {