Auto merge of #116479 - scottmcm:no-1-simd, r=<try>

Copy 1-/2-element arrays as scalars, not vectors For `[T; 1]` it's silly to copy as `<1 x T>` when we can just copy as `T`. And treat `[T; 2]` as a scalar pair (like `(T, T)`) when copying it. Inspired by #101210 (comment), which pointed out that `Option<[u8; 1]>` was codegenning worse than `Option<u8>`. (I'm not sure *why* LLVM doesn't optimize out `<1 x u8>`, but might as well just not emit it in the first place in this codepath.)
rust-lang · Oct 6, 2023 · b571f53 · b571f53
2 parents 1bc0463 + b5a9dd7
commit b571f53
Show file tree

Hide file tree

Showing 5 changed files with 89 additions and 5 deletions.
diff --git a/compiler/rustc_codegen_llvm/src/type_of.rs b/compiler/rustc_codegen_llvm/src/type_of.rs
@@ -383,9 +383,6 @@ impl<'tcx> LayoutLlvmExt<'tcx> for TyAndLayout<'tcx> {
 
         // Vectors, even for non-power-of-two sizes, have the same layout as
         // arrays but don't count as aggregate types
-        // While LLVM theoretically supports non-power-of-two sizes, and they
-        // often work fine, sometimes x86-isel deals with them horribly
-        // (see #115212) so for now only use power-of-two ones.
         if let FieldsShape::Array { count, .. } = self.layout.fields()
             && count.is_power_of_two()
             && let element = self.field(cx, 0)
@@ -396,8 +393,19 @@ impl<'tcx> LayoutLlvmExt<'tcx> for TyAndLayout<'tcx> {
             // up suppressing vectorization as it introduces shifts when it
             // extracts all the individual values.
 
-            let ety = element.llvm_type(cx);
-            return Some(cx.type_vector(ety, *count));
+            if *count <= 2 {
+                // For short arrays, use LLVM's array type which it will unpack
+                // out in optimizations to a scalar or pair of scalars.
+                // (Having types like `<1 x u8>` is silly.)
+                let ety = element.llvm_type(cx);
+                return Some(cx.type_array(ety, *count));
+            } else if count.is_power_of_two() {
+                // While LLVM theoretically supports non-power-of-two sizes, and they
+                // often work fine, sometimes x86-isel deals with them horribly
+                // (see #115212) so for now only use power-of-two ones.
+                let ety = element.llvm_type(cx);
+                return Some(cx.type_vector(ety, *count));
+            }
         }
 
         // FIXME: The above only handled integer arrays; surely more things

diff --git a/tests/assembly/x86_64-array-pair-load-store-merge.rs b/tests/assembly/x86_64-array-pair-load-store-merge.rs
@@ -0,0 +1,18 @@
+// assembly-output: emit-asm
+// compile-flags: --crate-type=lib -O -C llvm-args=-x86-asm-syntax=intel
+// only-x86_64
+// ignore-sgx
+
+// This is emitted in a way that results in two loads and two stores in LLVM-IR.
+// Confirm that that doesn't mean 4 instructions in assembly.
+
+// CHECK-LABEL: array_copy_2_elements:
+#[no_mangle]
+pub fn array_copy_2_elements(a: &[u8; 2], p: &mut [u8; 2]) {
+    // CHECK-NOT: byte
+    // CHECK-NOT: mov
+    // CHECK: mov{{.+}}, word ptr
+    // CHECK-NEXT: mov word ptr
+    // CHECK-NEXT: ret
+    *p = *a;
+}
diff --git a/tests/codegen/array-clone.rs b/tests/codegen/array-clone.rs
@@ -1,4 +1,5 @@
 // compile-flags: -O
+// min-llvm-version: 17 (earlier versions have trouble merging the loads)
 
 #![crate_type = "lib"]
 

diff --git a/tests/codegen/array-codegen.rs b/tests/codegen/array-codegen.rs
@@ -32,3 +32,25 @@ pub fn array_copy(a: &[u8; 4], p: &mut [u8; 4]) {
     // CHECK: store <4 x i8> %[[TEMP2]], ptr %p, align 1
     *p = *a;
 }
+
+// CHECK-LABEL: @array_copy_1_element
+#[no_mangle]
+pub fn array_copy_1_element(a: &[u8; 1], p: &mut [u8; 1]) {
+    // CHECK: %[[LOCAL:.+]] = alloca [1 x i8], align 1
+    // CHECK: %[[TEMP1:.+]] = load [1 x i8], ptr %a, align 1
+    // CHECK: store [1 x i8] %[[TEMP1]], ptr %[[LOCAL]], align 1
+    // CHECK: %[[TEMP2:.+]] = load [1 x i8], ptr %[[LOCAL]], align 1
+    // CHECK: store [1 x i8] %[[TEMP2]], ptr %p, align 1
+    *p = *a;
+}
+
+// CHECK-LABEL: @array_copy_2_elements
+#[no_mangle]
+pub fn array_copy_2_elements(a: &[u8; 2], p: &mut [u8; 2]) {
+    // CHECK: %[[LOCAL:.+]] = alloca [2 x i8], align 1
+    // CHECK: %[[TEMP1:.+]] = load [2 x i8], ptr %a, align 1
+    // CHECK: store [2 x i8] %[[TEMP1]], ptr %[[LOCAL]], align 1
+    // CHECK: %[[TEMP2:.+]] = load [2 x i8], ptr %[[LOCAL]], align 1
+    // CHECK: store [2 x i8] %[[TEMP2]], ptr %p, align 1
+    *p = *a;
+}
diff --git a/tests/codegen/array-optimized.rs b/tests/codegen/array-optimized.rs
@@ -0,0 +1,35 @@
+// compile-flags: -O
+
+#![crate_type = "lib"]
+
+// CHECK-LABEL: @array_copy_1_element
+#[no_mangle]
+pub fn array_copy_1_element(a: &[u8; 1], p: &mut [u8; 1]) {
+    // CHECK-NOT: alloca
+    // CHECK: %[[TEMP:.+]] = load i8, ptr %a, align 1
+    // CHECK: store i8 %[[TEMP]], ptr %p, align 1
+    // CHECK: ret
+    *p = *a;
+}
+
+// CHECK-LABEL: @array_copy_2_elements
+#[no_mangle]
+pub fn array_copy_2_elements(a: &[u8; 2], p: &mut [u8; 2]) {
+    // CHECK-NOT: alloca
+    // CHECK: %[[TEMP1:.+]] = load i8, ptr %a, align 1
+    // CHECK: %[[TEMP2:.+]] = load i8, ptr
+    // CHECK: store i8 %[[TEMP1]], ptr %p, align 1
+    // CHECK: store i8 %[[TEMP2]], ptr
+    // CHECK: ret
+    *p = *a;
+}
+
+// CHECK-LABEL: @array_copy_4_elements
+#[no_mangle]
+pub fn array_copy_4_elements(a: &[u8; 4], p: &mut [u8; 4]) {
+    // CHECK-NOT: alloca
+    // CHECK: %[[TEMP:.+]] = load <4 x i8>, ptr %a, align 1
+    // CHECK: store <4 x i8> %[[TEMP]], ptr %p, align 1
+    // CHECK: ret
+    *p = *a;
+}