From 89f9da2056d88ab96faf95901687cb772d2dc10c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= <eduardosm-dev@e64.io>
Date: Tue, 10 Oct 2023 18:29:07 +0200
Subject: [PATCH 1/4] Bump stdarch submodule

---
 library/stdarch | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/library/stdarch b/library/stdarch
index 333e9e9977188..f4528dd6e85d9 160000
--- a/library/stdarch
+++ b/library/stdarch
@@ -1 +1 @@
-Subproject commit 333e9e9977188d0748327e9b5be0f3f412063174
+Subproject commit f4528dd6e85d97bb802240d7cd048b6e1bf72540

From b57a1570920623d57c9e47b668896fe288d4f6e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= <eduardosm-dev@e64.io>
Date: Tue, 10 Oct 2023 18:35:28 +0200
Subject: [PATCH 2/4] Remove from cranelift codegen LLVM intrinsics that are no
 longer needed

---
 .../src/intrinsics/llvm_x86.rs                | 35 -------------------
 1 file changed, 35 deletions(-)

diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs
index 0c9a94e1c231f..559c64bb13bdd 100644
--- a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs
+++ b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs
@@ -32,41 +32,6 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             ret.write_cvalue(fx, CValue::by_val(res, fx.layout_of(fx.tcx.types.i64)));
         }
 
-        // Used by `_mm_movemask_epi8` and `_mm256_movemask_epi8`
-        "llvm.x86.sse2.pmovmskb.128"
-        | "llvm.x86.avx2.pmovmskb"
-        | "llvm.x86.sse.movmsk.ps"
-        | "llvm.x86.sse2.movmsk.pd" => {
-            intrinsic_args!(fx, args => (a); intrinsic);
-
-            let (lane_count, lane_ty) = a.layout().ty.simd_size_and_type(fx.tcx);
-            let lane_ty = fx.clif_type(lane_ty).unwrap();
-            assert!(lane_count <= 32);
-
-            let mut res = fx.bcx.ins().iconst(types::I32, 0);
-
-            for lane in (0..lane_count).rev() {
-                let a_lane = a.value_lane(fx, lane).load_scalar(fx);
-
-                // cast float to int
-                let a_lane = match lane_ty {
-                    types::F32 => codegen_bitcast(fx, types::I32, a_lane),
-                    types::F64 => codegen_bitcast(fx, types::I64, a_lane),
-                    _ => a_lane,
-                };
-
-                // extract sign bit of an int
-                let a_lane_sign = fx.bcx.ins().ushr_imm(a_lane, i64::from(lane_ty.bits() - 1));
-
-                // shift sign bit into result
-                let a_lane_sign = clif_intcast(fx, a_lane_sign, types::I32, false);
-                res = fx.bcx.ins().ishl_imm(res, 1);
-                res = fx.bcx.ins().bor(res, a_lane_sign);
-            }
-
-            let res = CValue::by_val(res, fx.layout_of(fx.tcx.types.i32));
-            ret.write_cvalue(fx, res);
-        }
         "llvm.x86.sse.cmp.ps" | "llvm.x86.sse2.cmp.pd" => {
             let (x, y, kind) = match args {
                 [x, y, kind] => (x, y, kind),

From 337af7caf76e14051721943fb67484ad8a6cff35 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= <eduardosm-dev@e64.io>
Date: Tue, 10 Oct 2023 18:43:42 +0200
Subject: [PATCH 3/4] Remove from miri LLVM intrinsics that are no longer
 needed

---
 src/tools/miri/src/shims/x86/sse.rs  |  19 ---
 src/tools/miri/src/shims/x86/sse2.rs | 176 +--------------------------
 src/tools/miri/src/shims/x86/sse3.rs |  26 ----
 3 files changed, 1 insertion(+), 220 deletions(-)

diff --git a/src/tools/miri/src/shims/x86/sse.rs b/src/tools/miri/src/shims/x86/sse.rs
index 6f0b76059f10d..831228b7a26cc 100644
--- a/src/tools/miri/src/shims/x86/sse.rs
+++ b/src/tools/miri/src/shims/x86/sse.rs
@@ -209,25 +209,6 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
                     )?;
                 }
             }
-            // Used to implement the _mm_movemask_ps function.
-            // Returns a scalar integer where the i-th bit is the highest
-            // bit of the i-th component of `op`.
-            // https://www.felixcloutier.com/x86/movmskps
-            "movmsk.ps" => {
-                let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
-                let (op, op_len) = this.operand_to_simd(op)?;
-
-                let mut res = 0;
-                for i in 0..op_len {
-                    let op = this.read_scalar(&this.project_index(&op, i)?)?;
-                    let op = op.to_u32()?;
-
-                    // Extract the highest bit of `op` and place it in the `i`-th bit of `res`
-                    res |= (op >> 31) << i;
-                }
-
-                this.write_scalar(Scalar::from_u32(res), dest)?;
-            }
             _ => return Ok(EmulateForeignItemResult::NotSupported),
         }
         Ok(EmulateForeignItemResult::NeedsJumping)
diff --git a/src/tools/miri/src/shims/x86/sse2.rs b/src/tools/miri/src/shims/x86/sse2.rs
index c6a847b5cf824..3f2b9f5f0adfe 100644
--- a/src/tools/miri/src/shims/x86/sse2.rs
+++ b/src/tools/miri/src/shims/x86/sse2.rs
@@ -1,8 +1,4 @@
-use rustc_apfloat::{
-    ieee::{Double, Single},
-    Float as _,
-};
-use rustc_middle::mir;
+use rustc_apfloat::ieee::Double;
 use rustc_middle::ty::layout::LayoutOf as _;
 use rustc_middle::ty::Ty;
 use rustc_span::Symbol;
@@ -39,49 +35,6 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
         // Intrinsincs sufixed with "epiX" or "epuX" operate with X-bit signed or unsigned
         // vectors.
         match unprefixed_name {
-            // Used to implement the _mm_avg_epu8 and _mm_avg_epu16 functions.
-            // Averages packed unsigned 8/16-bit integers in `left` and `right`.
-            "pavg.b" | "pavg.w" => {
-                let [left, right] =
-                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
-
-                let (left, left_len) = this.operand_to_simd(left)?;
-                let (right, right_len) = this.operand_to_simd(right)?;
-                let (dest, dest_len) = this.place_to_simd(dest)?;
-
-                assert_eq!(dest_len, left_len);
-                assert_eq!(dest_len, right_len);
-
-                for i in 0..dest_len {
-                    let left = this.read_immediate(&this.project_index(&left, i)?)?;
-                    let right = this.read_immediate(&this.project_index(&right, i)?)?;
-                    let dest = this.project_index(&dest, i)?;
-
-                    // Widen the operands to avoid overflow
-                    let twice_wide = this.layout_of(this.get_twice_wide_int_ty(left.layout.ty))?;
-                    let left = this.int_to_int_or_float(&left, twice_wide)?;
-                    let right = this.int_to_int_or_float(&right, twice_wide)?;
-
-                    // Calculate left + right + 1
-                    let added = this.wrapping_binary_op(mir::BinOp::Add, &left, &right)?;
-                    let added = this.wrapping_binary_op(
-                        mir::BinOp::Add,
-                        &added,
-                        &ImmTy::from_uint(1u32, twice_wide),
-                    )?;
-
-                    // Calculate (left + right + 1) / 2
-                    let divided = this.wrapping_binary_op(
-                        mir::BinOp::Div,
-                        &added,
-                        &ImmTy::from_uint(2u32, twice_wide),
-                    )?;
-
-                    // Narrow back to the original type
-                    let res = this.int_to_int_or_float(&divided, dest.layout)?;
-                    this.write_immediate(*res, &dest)?;
-                }
-            }
             // Used to implement the _mm_madd_epi16 function.
             // Multiplies packed signed 16-bit integers in `left` and `right`, producing
             // intermediate signed 32-bit integers. Horizontally add adjacent pairs of
@@ -118,70 +71,6 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
                     this.write_scalar(Scalar::from_i32(res), &dest)?;
                 }
             }
-            // Used to implement the _mm_mulhi_epi16 and _mm_mulhi_epu16 functions.
-            "pmulh.w" | "pmulhu.w" => {
-                let [left, right] =
-                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
-
-                let (left, left_len) = this.operand_to_simd(left)?;
-                let (right, right_len) = this.operand_to_simd(right)?;
-                let (dest, dest_len) = this.place_to_simd(dest)?;
-
-                assert_eq!(dest_len, left_len);
-                assert_eq!(dest_len, right_len);
-
-                for i in 0..dest_len {
-                    let left = this.read_immediate(&this.project_index(&left, i)?)?;
-                    let right = this.read_immediate(&this.project_index(&right, i)?)?;
-                    let dest = this.project_index(&dest, i)?;
-
-                    // Widen the operands to avoid overflow
-                    let twice_wide = this.layout_of(this.get_twice_wide_int_ty(left.layout.ty))?;
-                    let left = this.int_to_int_or_float(&left, twice_wide)?;
-                    let right = this.int_to_int_or_float(&right, twice_wide)?;
-
-                    // Multiply
-                    let multiplied = this.wrapping_binary_op(mir::BinOp::Mul, &left, &right)?;
-                    // Keep the high half
-                    let high = this.wrapping_binary_op(
-                        mir::BinOp::Shr,
-                        &multiplied,
-                        &ImmTy::from_uint(dest.layout.size.bits(), twice_wide),
-                    )?;
-
-                    // Narrow back to the original type
-                    let res = this.int_to_int_or_float(&high, dest.layout)?;
-                    this.write_immediate(*res, &dest)?;
-                }
-            }
-            // Used to implement the _mm_mul_epu32 function.
-            // Multiplies the the low unsigned 32-bit integers from each packed
-            // 64-bit element and stores the result as 64-bit unsigned integers.
-            "pmulu.dq" => {
-                let [left, right] =
-                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
-
-                let (left, left_len) = this.operand_to_simd(left)?;
-                let (right, right_len) = this.operand_to_simd(right)?;
-                let (dest, dest_len) = this.place_to_simd(dest)?;
-
-                // left and right are u32x4, dest is u64x2
-                assert_eq!(left_len, 4);
-                assert_eq!(right_len, 4);
-                assert_eq!(dest_len, 2);
-
-                for i in 0..dest_len {
-                    let op_i = i.checked_mul(2).unwrap();
-                    let left = this.read_scalar(&this.project_index(&left, op_i)?)?.to_u32()?;
-                    let right = this.read_scalar(&this.project_index(&right, op_i)?)?.to_u32()?;
-                    let dest = this.project_index(&dest, i)?;
-
-                    // The multiplication will not overflow because stripping the
-                    // operands are expanded from 32-bit to 64-bit.
-                    let res = u64::from(left).checked_mul(u64::from(right)).unwrap();
-                    this.write_scalar(Scalar::from_u64(res), &dest)?;
-                }
-            }
             // Used to implement the _mm_sad_epu8 function.
             // Computes the absolute differences of packed unsigned 8-bit integers in `a`
             // and `b`, then horizontally sum each consecutive 8 differences to produce
@@ -370,25 +259,6 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
                     this.write_scalar(Scalar::from_u64(res), &dest)?;
                 }
             }
-            // Used to implement the _mm_cvtepi32_ps function.
-            // Converts packed i32 to packed f32.
-            // FIXME: Can we get rid of this intrinsic and just use simd_as?
-            "cvtdq2ps" => {
-                let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
-
-                let (op, op_len) = this.operand_to_simd(op)?;
-                let (dest, dest_len) = this.place_to_simd(dest)?;
-
-                assert_eq!(dest_len, op_len);
-
-                for i in 0..dest_len {
-                    let op = this.read_scalar(&this.project_index(&op, i)?)?.to_i32()?;
-                    let dest = this.project_index(&dest, i)?;
-
-                    let res = Scalar::from_f32(Single::from_i128(op.into()).value);
-                    this.write_scalar(res, &dest)?;
-                }
-            }
             // Used to implement the _mm_cvtps_epi32 and _mm_cvttps_epi32 functions.
             // Converts packed f32 to packed i32.
             "cvtps2dq" | "cvttps2dq" => {
@@ -652,31 +522,6 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
                 };
                 this.write_scalar(Scalar::from_i32(i32::from(res)), dest)?;
             }
-            // Used to implement the _mm_cvtpd_ps and _mm_cvtps_pd functions.
-            // Converts packed f32/f64 to packed f64/f32.
-            "cvtpd2ps" | "cvtps2pd" => {
-                let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
-
-                let (op, op_len) = this.operand_to_simd(op)?;
-                let (dest, dest_len) = this.place_to_simd(dest)?;
-
-                // For cvtpd2ps: op is f64x2, dest is f32x4
-                // For cvtps2pd: op is f32x4, dest is f64x2
-                // In either case, the two first values are converted
-                for i in 0..op_len.min(dest_len) {
-                    let op = this.read_immediate(&this.project_index(&op, i)?)?;
-                    let dest = this.project_index(&dest, i)?;
-
-                    let res = this.float_to_float_or_int(&op, dest.layout)?;
-                    this.write_immediate(*res, &dest)?;
-                }
-                // For f32 -> f64, ignore the remaining
-                // For f64 -> f32, fill the remaining with zeros
-                for i in op_len..dest_len {
-                    let dest = this.project_index(&dest, i)?;
-                    this.write_scalar(Scalar::from_int(0, dest.layout.size), &dest)?;
-                }
-            }
             // Used to implement the _mm_cvtpd_epi32 and _mm_cvttpd_epi32 functions.
             // Converts packed f64 to packed i32.
             "cvtpd2dq" | "cvttpd2dq" => {
@@ -772,25 +617,6 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
                     )?;
                 }
             }
-            // Used to implement the _mm_movemask_pd function.
-            // Returns a scalar integer where the i-th bit is the highest
-            // bit of the i-th component of `op`.
-            // https://www.felixcloutier.com/x86/movmskpd
-            "movmsk.pd" => {
-                let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
-                let (op, op_len) = this.operand_to_simd(op)?;
-
-                let mut res = 0;
-                for i in 0..op_len {
-                    let op = this.read_scalar(&this.project_index(&op, i)?)?;
-                    let op = op.to_u64()?;
-
-                    // Extract the highest bit of `op` and place it in the `i`-th bit of `res`
-                    res |= (op >> 63) << i;
-                }
-
-                this.write_scalar(Scalar::from_u32(res.try_into().unwrap()), dest)?;
-            }
             // Used to implement the `_mm_pause` function.
             // The intrinsic is used to hint the processor that the code is in a spin-loop.
             "pause" => {
diff --git a/src/tools/miri/src/shims/x86/sse3.rs b/src/tools/miri/src/shims/x86/sse3.rs
index a41de5dbf7ee5..20a4b560749ac 100644
--- a/src/tools/miri/src/shims/x86/sse3.rs
+++ b/src/tools/miri/src/shims/x86/sse3.rs
@@ -23,32 +23,6 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
         let unprefixed_name = link_name.as_str().strip_prefix("llvm.x86.sse3.").unwrap();
 
         match unprefixed_name {
-            // Used to implement the _mm_addsub_ps and _mm_addsub_pd functions.
-            // Alternatingly add and subtract floating point (f32 or f64) from
-            // `left` and `right`
-            "addsub.ps" | "addsub.pd" => {
-                let [left, right] =
-                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
-
-                let (left, left_len) = this.operand_to_simd(left)?;
-                let (right, right_len) = this.operand_to_simd(right)?;
-                let (dest, dest_len) = this.place_to_simd(dest)?;
-
-                assert_eq!(dest_len, left_len);
-                assert_eq!(dest_len, right_len);
-
-                for i in 0..dest_len {
-                    let left = this.read_immediate(&this.project_index(&left, i)?)?;
-                    let right = this.read_immediate(&this.project_index(&right, i)?)?;
-                    let dest = this.project_index(&dest, i)?;
-
-                    // Even elements are subtracted and odd elements are added.
-                    let op = if i % 2 == 0 { mir::BinOp::Sub } else { mir::BinOp::Add };
-                    let res = this.wrapping_binary_op(op, &left, &right)?;
-
-                    this.write_immediate(*res, &dest)?;
-                }
-            }
             // Used to implement the _mm_h{add,sub}_p{s,d} functions.
             // Horizontally add/subtract adjacent floating point values
             // in `left` and `right`.

From 35e2f4e0af4bfce8802239ac498255ee9b02fbb3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= <eduardosm-dev@e64.io>
Date: Tue, 10 Oct 2023 18:44:53 +0200
Subject: [PATCH 4/4] Fix identation of a `rustfmt::skip`ed statement

---
 src/tools/miri/tests/pass/intrinsics-x86-sse2.rs | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/tools/miri/tests/pass/intrinsics-x86-sse2.rs b/src/tools/miri/tests/pass/intrinsics-x86-sse2.rs
index 2c7665bc73631..e636d6c8aaf8d 100644
--- a/src/tools/miri/tests/pass/intrinsics-x86-sse2.rs
+++ b/src/tools/miri/tests/pass/intrinsics-x86-sse2.rs
@@ -117,12 +117,12 @@ mod tests {
         #[target_feature(enable = "sse2")]
         unsafe fn test_mm_sad_epu8() {
             #[rustfmt::skip]
-        let a = _mm_setr_epi8(
-            255u8 as i8, 254u8 as i8, 253u8 as i8, 252u8 as i8,
-            1, 2, 3, 4,
-            155u8 as i8, 154u8 as i8, 153u8 as i8, 152u8 as i8,
-            1, 2, 3, 4,
-        );
+            let a = _mm_setr_epi8(
+                255u8 as i8, 254u8 as i8, 253u8 as i8, 252u8 as i8,
+                1, 2, 3, 4,
+                155u8 as i8, 154u8 as i8, 153u8 as i8, 152u8 as i8,
+                1, 2, 3, 4,
+            );
             let b = _mm_setr_epi8(0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2);
             let r = _mm_sad_epu8(a, b);
             let e = _mm_setr_epi64x(1020, 614);