From d2a2fd3c4b38356e69248b49f4a51a9400e508e1 Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Sun, 26 Oct 2025 16:38:46 +0100 Subject: [PATCH 01/17] forward `TEST_SAMPLE_INTRINSICS_PERCENTAGE` --- library/stdarch/ci/intrinsic-test-docker.sh | 1 + library/stdarch/ci/intrinsic-test.sh | 9 +++++++-- .../stdarch/crates/intrinsic-test/src/arm/mod.rs | 8 ++++++-- .../crates/intrinsic-test/src/common/compare.rs | 6 +++++- .../stdarch/crates/intrinsic-test/src/x86/mod.rs | 15 +++++++-------- 5 files changed, 26 insertions(+), 13 deletions(-) diff --git a/library/stdarch/ci/intrinsic-test-docker.sh b/library/stdarch/ci/intrinsic-test-docker.sh index 038fc4678ed2e..f62d7e484f5b1 100755 --- a/library/stdarch/ci/intrinsic-test-docker.sh +++ b/library/stdarch/ci/intrinsic-test-docker.sh @@ -36,6 +36,7 @@ run() { --env NORUN \ --env RUSTFLAGS \ --env CARGO_UNSTABLE_BUILD_STD \ + --env TEST_SAMPLE_INTRINSICS_PERCENTAGE \ --volume "${HOME}/.cargo":/cargo \ --volume "$(rustc --print sysroot)":/rust:ro \ --volume "$(pwd)":/checkout:ro \ diff --git a/library/stdarch/ci/intrinsic-test.sh b/library/stdarch/ci/intrinsic-test.sh index e14a824b2ae66..be309f9e42f3e 100755 --- a/library/stdarch/ci/intrinsic-test.sh +++ b/library/stdarch/ci/intrinsic-test.sh @@ -51,6 +51,7 @@ case ${TARGET} in TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_aarch64.txt TEST_CXX_COMPILER="clang++" TEST_RUNNER="${CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUNNER}" + : "${TEST_SAMPLE_INTRINSICS_PERCENTAGE:=100}" ;; aarch64_be-unknown-linux-gnu*) @@ -58,6 +59,7 @@ case ${TARGET} in TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_aarch64.txt TEST_CXX_COMPILER="clang++" TEST_RUNNER="${CARGO_TARGET_AARCH64_BE_UNKNOWN_LINUX_GNU_RUNNER}" + : "${TEST_SAMPLE_INTRINSICS_PERCENTAGE:=100}" ;; armv7-unknown-linux-gnueabihf*) @@ -65,6 +67,7 @@ case ${TARGET} in TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_arm.txt TEST_CXX_COMPILER="clang++" TEST_RUNNER="${CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_RUNNER}" + : "${TEST_SAMPLE_INTRINSICS_PERCENTAGE:=100}" ;; x86_64-unknown-linux-gnu*) @@ -72,7 +75,7 @@ case ${TARGET} in TEST_CXX_COMPILER="clang++" TEST_RUNNER="${CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER}" TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_x86.txt - TEST_SAMPLE_INTRINSICS_PERCENTAGE=5 + : "${TEST_SAMPLE_INTRINSICS_PERCENTAGE:=5}" ;; *) ;; @@ -88,7 +91,8 @@ case "${TARGET}" in --runner "${TEST_RUNNER}" \ --cppcompiler "${TEST_CXX_COMPILER}" \ --skip "${TEST_SKIP_INTRINSICS}" \ - --target "${TARGET}" + --target "${TARGET}" \ + --sample-percentage "${TEST_SAMPLE_INTRINSICS_PERCENTAGE}" ;; aarch64_be-unknown-linux-gnu*) @@ -99,6 +103,7 @@ case "${TARGET}" in --cppcompiler "${TEST_CXX_COMPILER}" \ --skip "${TEST_SKIP_INTRINSICS}" \ --target "${TARGET}" \ + --sample-percentage "${TEST_SAMPLE_INTRINSICS_PERCENTAGE}" \ --linker "${CARGO_TARGET_AARCH64_BE_UNKNOWN_LINUX_GNU_LINKER}" \ --cxx-toolchain-dir "${AARCH64_BE_TOOLCHAIN}" ;; diff --git a/library/stdarch/crates/intrinsic-test/src/arm/mod.rs b/library/stdarch/crates/intrinsic-test/src/arm/mod.rs index 7fa5062e86522..99c8da854c506 100644 --- a/library/stdarch/crates/intrinsic-test/src/arm/mod.rs +++ b/library/stdarch/crates/intrinsic-test/src/arm/mod.rs @@ -48,8 +48,12 @@ impl SupportedArchitectureTest for ArmArchitectureTest { .expect("Error parsing input file"); intrinsics.sort_by(|a, b| a.name.cmp(&b.name)); + intrinsics.dedup(); + + let sample_percentage: usize = cli_options.sample_percentage as usize; + let sample_size = (intrinsics.len() * sample_percentage) / 100; - 
let mut intrinsics = intrinsics + let intrinsics = intrinsics .into_iter() // Not sure how we would compare intrinsic that returns void. .filter(|i| i.results.kind() != TypeKind::Void) @@ -61,8 +65,8 @@ impl SupportedArchitectureTest for ArmArchitectureTest { .filter(|i| !i.arguments.iter().any(|a| a.ty.inner_size() == 128)) .filter(|i| !cli_options.skip.contains(&i.name)) .filter(|i| !(a32 && i.arch_tags == vec!["A64".to_string()])) + .take(sample_size) .collect::>(); - intrinsics.dedup(); Self { intrinsics, diff --git a/library/stdarch/crates/intrinsic-test/src/common/compare.rs b/library/stdarch/crates/intrinsic-test/src/common/compare.rs index 902df94283fd6..c0459b743a7f0 100644 --- a/library/stdarch/crates/intrinsic-test/src/common/compare.rs +++ b/library/stdarch/crates/intrinsic-test/src/common/compare.rs @@ -86,6 +86,10 @@ pub fn compare_outputs(intrinsic_name_list: &Vec, runner: &str, target: println!("Failed to run rust program for intrinsic {intrinsic}") } }); - println!("{} differences found", intrinsics.len()); + println!( + "{} differences found (tested {} intrinsics)", + intrinsics.len(), + intrinsic_name_list.len() + ); intrinsics.is_empty() } diff --git a/library/stdarch/crates/intrinsic-test/src/x86/mod.rs b/library/stdarch/crates/intrinsic-test/src/x86/mod.rs index 956e51836f3f7..4adf85017bc13 100644 --- a/library/stdarch/crates/intrinsic-test/src/x86/mod.rs +++ b/library/stdarch/crates/intrinsic-test/src/x86/mod.rs @@ -11,7 +11,6 @@ use crate::common::compile_c::CppCompilation; use crate::common::intrinsic::Intrinsic; use crate::common::intrinsic_helpers::TypeKind; use intrinsic::X86IntrinsicType; -use itertools::Itertools; use xml_parser::get_xml_intrinsics; pub struct X86ArchitectureTest { @@ -44,12 +43,16 @@ impl SupportedArchitectureTest for X86ArchitectureTest { const PLATFORM_RUST_CFGS: &str = config::PLATFORM_RUST_CFGS; fn create(cli_options: ProcessedCli) -> Self { - let intrinsics = + let mut intrinsics = get_xml_intrinsics(&cli_options.filename).expect("Error parsing input file"); + intrinsics.sort_by(|a, b| a.name.cmp(&b.name)); + intrinsics.dedup(); + let sample_percentage: usize = cli_options.sample_percentage as usize; + let sample_size = (intrinsics.len() * sample_percentage) / 100; - let mut intrinsics = intrinsics + let intrinsics = intrinsics .into_iter() // Not sure how we would compare intrinsic that returns void. 
.filter(|i| i.results.kind() != TypeKind::Void) @@ -61,13 +64,9 @@ impl SupportedArchitectureTest for X86ArchitectureTest { .filter(|i| !i.arguments.iter().any(|a| a.is_ptr())) .filter(|i| !i.arguments.iter().any(|a| a.ty.inner_size() == 128)) .filter(|i| !cli_options.skip.contains(&i.name)) - .unique_by(|i| i.name.clone()) + .take(sample_size) .collect::>(); - let sample_size = (intrinsics.len() * sample_percentage) / 100; - intrinsics.truncate(sample_size); - - intrinsics.sort_by(|a, b| a.name.cmp(&b.name)); Self { intrinsics: intrinsics, cli_options: cli_options, From d6180854223f4acd5318f52ab523293e7014f40d Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Sun, 26 Oct 2025 19:28:46 +0100 Subject: [PATCH 02/17] intrinsic-test: display more logs in CI --- library/stdarch/ci/intrinsic-test.sh | 6 +++--- .../stdarch/crates/intrinsic-test/src/common/mod.rs | 12 ++++++++---- library/stdarch/crates/intrinsic-test/src/main.rs | 2 +- library/stdarch/crates/intrinsic-test/src/x86/mod.rs | 2 +- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/library/stdarch/ci/intrinsic-test.sh b/library/stdarch/ci/intrinsic-test.sh index be309f9e42f3e..be63f0c0c6178 100755 --- a/library/stdarch/ci/intrinsic-test.sh +++ b/library/stdarch/ci/intrinsic-test.sh @@ -85,7 +85,7 @@ esac # Arm specific case "${TARGET}" in aarch64-unknown-linux-gnu*|armv7-unknown-linux-gnueabihf*) - CPPFLAGS="${TEST_CPPFLAGS}" RUSTFLAGS="${HOST_RUSTFLAGS}" RUST_LOG=warn \ + CPPFLAGS="${TEST_CPPFLAGS}" RUSTFLAGS="${HOST_RUSTFLAGS}" RUST_LOG=info \ cargo run "${INTRINSIC_TEST}" "${PROFILE}" \ --bin intrinsic-test -- intrinsics_data/arm_intrinsics.json \ --runner "${TEST_RUNNER}" \ @@ -96,7 +96,7 @@ case "${TARGET}" in ;; aarch64_be-unknown-linux-gnu*) - CPPFLAGS="${TEST_CPPFLAGS}" RUSTFLAGS="${HOST_RUSTFLAGS}" RUST_LOG=warn \ + CPPFLAGS="${TEST_CPPFLAGS}" RUSTFLAGS="${HOST_RUSTFLAGS}" RUST_LOG=info \ cargo run "${INTRINSIC_TEST}" "${PROFILE}" \ --bin intrinsic-test -- intrinsics_data/arm_intrinsics.json \ --runner "${TEST_RUNNER}" \ @@ -114,7 +114,7 @@ case "${TARGET}" in # Hence the use of `env -u`. 
env -u CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER \ CPPFLAGS="${TEST_CPPFLAGS}" RUSTFLAGS="${HOST_RUSTFLAGS}" \ - RUST_LOG=warn RUST_BACKTRACE=1 \ + RUST_LOG=info RUST_BACKTRACE=1 \ cargo run "${INTRINSIC_TEST}" "${PROFILE}" \ --bin intrinsic-test -- intrinsics_data/x86-intel.xml \ --runner "${TEST_RUNNER}" \ diff --git a/library/stdarch/crates/intrinsic-test/src/common/mod.rs b/library/stdarch/crates/intrinsic-test/src/common/mod.rs index d8f06ae23885e..8b6bd943a7423 100644 --- a/library/stdarch/crates/intrinsic-test/src/common/mod.rs +++ b/library/stdarch/crates/intrinsic-test/src/common/mod.rs @@ -79,12 +79,16 @@ pub trait SupportedArchitectureTest { trace!("compiling mod_{i}.cpp"); if let Some(cpp_compiler) = cpp_compiler_wrapped.as_ref() { let compile_output = cpp_compiler - .compile_object_file(&format!("mod_{i}.cpp"), &format!("mod_{i}.o")); + .compile_object_file(&format!("mod_{i}.cpp"), &format!("mod_{i}.o")) + .map_err(|e| format!("Error compiling mod_{i}.cpp: {e:?}"))?; + + assert!( + compile_output.status.success(), + "{}", + String::from_utf8_lossy(&compile_output.stderr) + ); trace!("finished compiling mod_{i}.cpp"); - if let Err(compile_error) = compile_output { - return Err(format!("Error compiling mod_{i}.cpp: {compile_error:?}")); - } } Ok(()) }) diff --git a/library/stdarch/crates/intrinsic-test/src/main.rs b/library/stdarch/crates/intrinsic-test/src/main.rs index ed3a50067dc4a..3580d80bd1127 100644 --- a/library/stdarch/crates/intrinsic-test/src/main.rs +++ b/library/stdarch/crates/intrinsic-test/src/main.rs @@ -34,7 +34,7 @@ fn run(test_environment: impl SupportedArchitectureTest) { if !test_environment.build_rust_file() { std::process::exit(3); } - info!("comaparing outputs"); + info!("comparing outputs"); if !test_environment.compare_outputs() { std::process::exit(1); } diff --git a/library/stdarch/crates/intrinsic-test/src/x86/mod.rs b/library/stdarch/crates/intrinsic-test/src/x86/mod.rs index 4adf85017bc13..f2baf070714c1 100644 --- a/library/stdarch/crates/intrinsic-test/src/x86/mod.rs +++ b/library/stdarch/crates/intrinsic-test/src/x86/mod.rs @@ -47,7 +47,7 @@ impl SupportedArchitectureTest for X86ArchitectureTest { get_xml_intrinsics(&cli_options.filename).expect("Error parsing input file"); intrinsics.sort_by(|a, b| a.name.cmp(&b.name)); - intrinsics.dedup(); + intrinsics.dedup_by(|a, b| a.name == b.name); let sample_percentage: usize = cli_options.sample_percentage as usize; let sample_size = (intrinsics.len() * sample_percentage) / 100; From 48116cf39d59782143172e8c3f86006b3a5bf12e Mon Sep 17 00:00:00 2001 From: sayantn Date: Sun, 6 Apr 2025 19:40:15 +0530 Subject: [PATCH 03/17] Add AMX intrinsics --- library/stdarch/crates/core_arch/src/lib.rs | 3 +- .../crates/core_arch/src/x86_64/amx.rs | 224 ++++++++++++++++++ 2 files changed, 226 insertions(+), 1 deletion(-) diff --git a/library/stdarch/crates/core_arch/src/lib.rs b/library/stdarch/crates/core_arch/src/lib.rs index 26a9cb5899183..dcd19186a1834 100644 --- a/library/stdarch/crates/core_arch/src/lib.rs +++ b/library/stdarch/crates/core_arch/src/lib.rs @@ -34,7 +34,8 @@ f16, aarch64_unstable_target_feature, bigint_helper_methods, - funnel_shifts + funnel_shifts, + avx10_target_feature )] #![cfg_attr(test, feature(test, abi_vectorcall, stdarch_internal))] #![deny(clippy::missing_inline_in_public_items)] diff --git a/library/stdarch/crates/core_arch/src/x86_64/amx.rs b/library/stdarch/crates/core_arch/src/x86_64/amx.rs index 4b33c0ab6c155..6d896c4918d13 100644 --- 
a/library/stdarch/crates/core_arch/src/x86_64/amx.rs +++ b/library/stdarch/crates/core_arch/src/x86_64/amx.rs @@ -1,3 +1,5 @@ +use crate::core_arch::{simd::*, x86::*}; + #[cfg(test)] use stdarch_test::assert_instr; @@ -242,6 +244,206 @@ pub unsafe fn _tile_cmmrlfp16ps() { tcmmrlfp16ps(DST as i8, A as i8, B as i8); } +/// Compute dot-product of BF8 (8-bit E5M2) floating-point elements in tile a and BF8 (8-bit E5M2) +/// floating-point elements in tile b, accumulating the intermediate single-precision +/// (32-bit) floating-point elements with elements in dst, and store the 32-bit result +/// back to tile dst. +#[inline] +#[rustc_legacy_const_generics(0, 1, 2)] +#[target_feature(enable = "amx-fp8")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(tdpbf8ps, DST = 0, A = 1, B = 2) +)] +#[unstable(feature = "x86_amx_intrinsics", issue = "126622")] +pub unsafe fn _tile_dpbf8ps() { + static_assert_uimm_bits!(DST, 3); + static_assert_uimm_bits!(A, 3); + static_assert_uimm_bits!(B, 3); + tdpbf8ps(DST as i8, A as i8, B as i8); +} + +/// Compute dot-product of BF8 (8-bit E5M2) floating-point elements in tile a and HF8 +/// (8-bit E4M3) floating-point elements in tile b, accumulating the intermediate single-precision +/// (32-bit) floating-point elements with elements in dst, and store the 32-bit result +/// back to tile dst. +#[inline] +#[rustc_legacy_const_generics(0, 1, 2)] +#[target_feature(enable = "amx-fp8")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(tdpbhf8ps, DST = 0, A = 1, B = 2) +)] +#[unstable(feature = "x86_amx_intrinsics", issue = "126622")] +pub unsafe fn _tile_dpbhf8ps() { + static_assert_uimm_bits!(DST, 3); + static_assert_uimm_bits!(A, 3); + static_assert_uimm_bits!(B, 3); + tdpbhf8ps(DST as i8, A as i8, B as i8); +} + +/// Compute dot-product of HF8 (8-bit E4M3) floating-point elements in tile a and BF8 +/// (8-bit E5M2) floating-point elements in tile b, accumulating the intermediate single-precision +/// (32-bit) floating-point elements with elements in dst, and store the 32-bit result +/// back to tile dst. +#[inline] +#[rustc_legacy_const_generics(0, 1, 2)] +#[target_feature(enable = "amx-fp8")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(tdphbf8ps, DST = 0, A = 1, B = 2) +)] +#[unstable(feature = "x86_amx_intrinsics", issue = "126622")] +pub unsafe fn _tile_dphbf8ps() { + static_assert_uimm_bits!(DST, 3); + static_assert_uimm_bits!(A, 3); + static_assert_uimm_bits!(B, 3); + tdphbf8ps(DST as i8, A as i8, B as i8); +} + +/// Compute dot-product of HF8 (8-bit E4M3) floating-point elements in tile a and HF8 (8-bit E4M3) +/// floating-point elements in tile b, accumulating the intermediate single-precision +/// (32-bit) floating-point elements with elements in dst, and store the 32-bit result +/// back to tile dst. +#[inline] +#[rustc_legacy_const_generics(0, 1, 2)] +#[target_feature(enable = "amx-fp8")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(tdphf8ps, DST = 0, A = 1, B = 2) +)] +#[unstable(feature = "x86_amx_intrinsics", issue = "126622")] +pub unsafe fn _tile_dphf8ps() { + static_assert_uimm_bits!(DST, 3); + static_assert_uimm_bits!(A, 3); + static_assert_uimm_bits!(B, 3); + tdphf8ps(DST as i8, A as i8, B as i8); +} + +/// Load tile rows from memory specified by base address and stride into destination tile dst +/// using the tile configuration previously configured via _tile_loadconfig. 
+/// Additionally, this intrinsic indicates the source memory location is likely to become +/// read-shared by multiple processors, i.e., read in the future by at least one other processor +/// before it is written, assuming it is ever written in the future. +#[inline] +#[rustc_legacy_const_generics(0)] +#[target_feature(enable = "amx-movrs")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(tileloaddrs, DST = 0) +)] +#[unstable(feature = "x86_amx_intrinsics", issue = "126622")] +pub unsafe fn _tile_loaddrs(base: *const u8, stride: usize) { + static_assert_uimm_bits!(DST, 3); + tileloaddrs64(DST as i8, base, stride); +} + +/// Load tile rows from memory specified by base address and stride into destination tile dst +/// using the tile configuration previously configured via _tile_loadconfig. +/// Provides a hint to the implementation that the data would be reused but does not need +/// to be resident in the nearest cache levels. +/// Additionally, this intrinsic indicates the source memory location is likely to become +/// read-shared by multiple processors, i.e., read in the future by at least one other processor +/// before it is written, assuming it is ever written in the future. +#[inline] +#[rustc_legacy_const_generics(0)] +#[target_feature(enable = "amx-movrs")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(tileloaddrst1, DST = 0) +)] +#[unstable(feature = "x86_amx_intrinsics", issue = "126622")] +pub unsafe fn _tile_stream_loaddrs(base: *const u8, stride: usize) { + static_assert_uimm_bits!(DST, 3); + tileloaddrst164(DST as i8, base, stride); +} + +/// Perform matrix multiplication of two tiles a and b, containing packed single precision (32-bit) +/// floating-point elements, which are converted to TF32 (tensor-float32) format, and accumulate the +/// results into a packed single precision tile. +/// For each possible combination of (row of a, column of b), it performs +/// - convert to TF32 +/// - multiply the corresponding elements of a and b +/// - accumulate the results into the corresponding row and column of dst using round-to-nearest-even +/// rounding mode. +/// Output FP32 denormals are always flushed to zero, input single precision denormals are always +/// handled and *not* treated as zero. +#[inline] +#[rustc_legacy_const_generics(0, 1, 2)] +#[target_feature(enable = "amx-tf32")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(tmmultf32ps, DST = 0, A = 1, B = 2) +)] +#[unstable(feature = "x86_amx_intrinsics", issue = "126622")] +pub unsafe fn _tile_mmultf32ps() { + static_assert_uimm_bits!(DST, 3); + static_assert_uimm_bits!(A, 3); + static_assert_uimm_bits!(B, 3); + tmmultf32ps(DST as i8, A as i8, B as i8); +} + +/// Moves a row from a tile register to a zmm register, converting the packed 32-bit signed integer +/// elements to packed single-precision (32-bit) floating-point elements. 
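+///
+/// The source tile register is selected by the const generic `TILE` (values 0
+/// through 7, enforced by `static_assert_uimm_bits!(TILE, 3)`), and `row` selects
+/// which row of that tile is read.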
+#[inline] +#[rustc_legacy_const_generics(0)] +#[target_feature(enable = "amx-avx512,avx10.2")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(tcvtrowd2ps, TILE = 0) +)] +#[unstable(feature = "x86_amx_intrinsics", issue = "126622")] +pub unsafe fn _tile_cvtrowd2ps(row: u32) -> __m512 { + static_assert_uimm_bits!(TILE, 3); + tcvtrowd2ps(TILE as i8, row).as_m512() +} + +/// Moves a row from a tile register to a zmm register, converting the packed single-precision (32-bit) +/// floating-point elements to packed half-precision (16-bit) floating-point elements. The resulting +/// 16-bit elements are placed in the high 16-bits within each 32-bit element of the returned vector. +#[inline] +#[rustc_legacy_const_generics(0)] +#[target_feature(enable = "amx-avx512,avx10.2")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(tcvtrowps2phh, TILE = 0) +)] +#[unstable(feature = "x86_amx_intrinsics", issue = "126622")] +pub unsafe fn _tile_cvtrowps2phh(row: u32) -> __m512h { + static_assert_uimm_bits!(TILE, 3); + tcvtrowps2phh(TILE as i8, row).as_m512h() +} + +/// Moves a row from a tile register to a zmm register, converting the packed single-precision (32-bit) +/// floating-point elements to packed half-precision (16-bit) floating-point elements. The resulting +/// 16-bit elements are placed in the low 16-bits within each 32-bit element of the returned vector. +#[inline] +#[rustc_legacy_const_generics(0)] +#[target_feature(enable = "amx-avx512,avx10.2")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(tcvtrowps2phl, TILE = 0) +)] +#[unstable(feature = "x86_amx_intrinsics", issue = "126622")] +pub unsafe fn _tile_cvtrowps2phl(row: u32) -> __m512h { + static_assert_uimm_bits!(TILE, 3); + tcvtrowps2phl(TILE as i8, row).as_m512h() +} + +/// Moves one row of tile data into a zmm vector register +#[inline] +#[rustc_legacy_const_generics(0)] +#[target_feature(enable = "amx-avx512,avx10.2")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(tilemovrow, TILE = 0) +)] +#[unstable(feature = "x86_amx_intrinsics", issue = "126622")] +pub unsafe fn _tile_movrow(row: u32) -> __m512i { + static_assert_uimm_bits!(TILE, 3); + tilemovrow(TILE as i8, row).as_m512i() +} + #[allow(improper_ctypes)] unsafe extern "C" { #[link_name = "llvm.x86.ldtilecfg"] @@ -274,6 +476,28 @@ unsafe extern "C" { fn tcmmimfp16ps(dst: i8, a: i8, b: i8); #[link_name = "llvm.x86.tcmmrlfp16ps"] fn tcmmrlfp16ps(dst: i8, a: i8, b: i8); + #[link_name = "llvm.x86.tdpbf8ps"] + fn tdpbf8ps(dst: i8, a: i8, b: i8); + #[link_name = "llvm.x86.tdpbhf8ps"] + fn tdpbhf8ps(dst: i8, a: i8, b: i8); + #[link_name = "llvm.x86.tdphbf8ps"] + fn tdphbf8ps(dst: i8, a: i8, b: i8); + #[link_name = "llvm.x86.tdphf8ps"] + fn tdphf8ps(dst: i8, a: i8, b: i8); + #[link_name = "llvm.x86.tileloaddrs64"] + fn tileloaddrs64(dst: i8, base: *const u8, stride: usize); + #[link_name = "llvm.x86.tileloaddrst164"] + fn tileloaddrst164(dst: i8, base: *const u8, stride: usize); + #[link_name = "llvm.x86.tmmultf32ps"] + fn tmmultf32ps(dst: i8, a: i8, b: i8); + #[link_name = "llvm.x86.tcvtrowd2ps"] + fn tcvtrowd2ps(tile: i8, row: u32) -> f32x16; + #[link_name = "llvm.x86.tcvtrowps2phh"] + fn tcvtrowps2phh(tile: i8, row: u32) -> f16x32; + #[link_name = "llvm.x86.tcvtrowps2phl"] + fn tcvtrowps2phl(tile: i8, row: u32) -> f16x32; + #[link_name = "llvm.x86.tilemovrow"] + fn tilemovrow(tile: i8, row: u32) -> i32x16; } #[cfg(test)] From 
28150236808936971807310430643a50398957bb Mon Sep 17 00:00:00 2001 From: sayantn Date: Sat, 1 Nov 2025 07:51:10 +0530 Subject: [PATCH 04/17] Patch stdarch_verify to not check intel definition for new AMX intrinsics --- library/stdarch/crates/stdarch-verify/tests/x86-intel.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs b/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs index 5a98db980b23e..4136463f197fd 100644 --- a/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs +++ b/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs @@ -304,6 +304,14 @@ fn verify_all_signatures() { if feature.contains("sse4a") || feature.contains("tbm") { continue; } + + // FIXME: these have not been added to Intrinsics Guide yet + if ["amx-avx512", "amx-fp8", "amx-movrs", "amx-tf32"] + .iter() + .any(|f| feature.contains(f)) + { + continue; + } } let intel = match map.remove(rust.name) { From 17c3f8ab5e52e1c02779383c69fdc34d1b28ef82 Mon Sep 17 00:00:00 2001 From: sayantn Date: Sat, 1 Nov 2025 07:50:36 +0530 Subject: [PATCH 05/17] Add tests for new AMX intrinsics --- .../crates/core_arch/src/x86_64/amx.rs | 228 +++++++++++++++++- 1 file changed, 227 insertions(+), 1 deletion(-) diff --git a/library/stdarch/crates/core_arch/src/x86_64/amx.rs b/library/stdarch/crates/core_arch/src/x86_64/amx.rs index 6d896c4918d13..c87514980df6f 100644 --- a/library/stdarch/crates/core_arch/src/x86_64/amx.rs +++ b/library/stdarch/crates/core_arch/src/x86_64/amx.rs @@ -504,7 +504,7 @@ unsafe extern "C" { mod tests { use crate::core_arch::x86::_mm_cvtness_sbh; use crate::core_arch::x86_64::*; - use core::mem::transmute; + use core::{array, mem::transmute}; use stdarch_test::simd_test; #[cfg(target_os = "linux")] use syscalls::{Sysno, syscall}; @@ -843,4 +843,230 @@ mod tests { _tile_release(); assert_eq!(res, [[0f32; 16]; 16]); } + + const BF8_ONE: u8 = 0x3c; + const BF8_TWO: u8 = 0x40; + const HF8_ONE: u8 = 0x38; + const HF8_TWO: u8 = 0x40; + + #[simd_test(enable = "amx-fp8")] + unsafe fn test_tile_dpbf8ps() { + _init_amx(); + let ones = [BF8_ONE; 1024]; + let twos = [BF8_TWO; 1024]; + let mut res = [[0.0_f32; 16]; 16]; + let mut config = __tilecfg::default(); + config.palette = 1; + (0..=2).for_each(|i| { + config.colsb[i] = 64; + config.rows[i] = 16; + }); + _tile_loadconfig(config.as_ptr()); + _tile_zero::<0>(); + _tile_loadd::<1>(&ones as *const u8, 64); + _tile_loadd::<2>(&twos as *const u8, 64); + _tile_dpbf8ps::<0, 1, 2>(); + _tile_stored::<0>(res.as_mut_ptr().cast(), 64); + _tile_release(); + assert_eq!(res, [[128.0_f32; 16]; 16]); + } + + #[simd_test(enable = "amx-fp8")] + unsafe fn test_tile_dpbhf8ps() { + _init_amx(); + let ones = [BF8_ONE; 1024]; + let twos = [HF8_TWO; 1024]; + let mut res = [[0.0_f32; 16]; 16]; + let mut config = __tilecfg::default(); + config.palette = 1; + (0..=2).for_each(|i| { + config.colsb[i] = 64; + config.rows[i] = 16; + }); + _tile_loadconfig(config.as_ptr()); + _tile_zero::<0>(); + _tile_loadd::<1>(&ones as *const u8, 64); + _tile_loadd::<2>(&twos as *const u8, 64); + _tile_dpbhf8ps::<0, 1, 2>(); + _tile_stored::<0>(res.as_mut_ptr().cast(), 64); + _tile_release(); + assert_eq!(res, [[128.0_f32; 16]; 16]); + } + + #[simd_test(enable = "amx-fp8")] + unsafe fn test_tile_dphbf8ps() { + _init_amx(); + let ones = [HF8_ONE; 1024]; + let twos = [BF8_TWO; 1024]; + let mut res = [[0.0_f32; 16]; 16]; + let mut config = __tilecfg::default(); + config.palette = 1; + (0..=2).for_each(|i| { + config.colsb[i] = 64; + 
config.rows[i] = 16; + }); + _tile_loadconfig(config.as_ptr()); + _tile_zero::<0>(); + _tile_loadd::<1>(&ones as *const u8, 64); + _tile_loadd::<2>(&twos as *const u8, 64); + _tile_dphbf8ps::<0, 1, 2>(); + _tile_stored::<0>(res.as_mut_ptr().cast(), 64); + _tile_release(); + assert_eq!(res, [[128.0_f32; 16]; 16]); + } + + #[simd_test(enable = "amx-fp8")] + unsafe fn test_tile_dphf8ps() { + _init_amx(); + let ones = [HF8_ONE; 1024]; + let twos = [HF8_TWO; 1024]; + let mut res = [[0.0_f32; 16]; 16]; + let mut config = __tilecfg::default(); + config.palette = 1; + (0..=2).for_each(|i| { + config.colsb[i] = 64; + config.rows[i] = 16; + }); + _tile_loadconfig(config.as_ptr()); + _tile_zero::<0>(); + _tile_loadd::<1>(&ones as *const u8, 64); + _tile_loadd::<2>(&twos as *const u8, 64); + _tile_dphf8ps::<0, 1, 2>(); + _tile_stored::<0>(res.as_mut_ptr().cast(), 64); + _tile_release(); + assert_eq!(res, [[128.0_f32; 16]; 16]); + } + + #[simd_test(enable = "amx-tile")] + unsafe fn test_tile_loaddrs() { + _init_amx(); + let mut config = __tilecfg::default(); + config.palette = 1; + config.colsb[0] = 64; + config.rows[0] = 16; + _tile_loadconfig(config.as_ptr()); + _tile_zero::<0>(); + let mat = [1_i8; 1024]; + _tile_loaddrs::<0>(&mat as *const i8 as *const u8, 64); + let mut out = [[0_i8; 64]; 16]; + _tile_stored::<0>(&mut out as *mut [i8; 64] as *mut u8, 64); + _tile_release(); + assert_eq!(out, [[1; 64]; 16]); + } + + #[simd_test(enable = "amx-tile")] + unsafe fn test_tile_stream_loaddrs() { + _init_amx(); + let mut config = __tilecfg::default(); + config.palette = 1; + config.colsb[0] = 64; + config.rows[0] = 16; + _tile_loadconfig(config.as_ptr()); + _tile_zero::<0>(); + let mat = [1_i8; 1024]; + _tile_stream_loaddrs::<0>(&mat as *const i8 as *const u8, 64); + let mut out = [[0_i8; 64]; 16]; + _tile_stored::<0>(&mut out as *mut [i8; 64] as *mut u8, 64); + _tile_release(); + assert_eq!(out, [[1; 64]; 16]); + } + + #[simd_test(enable = "amx-avx512,avx10.2")] + unsafe fn test_tile_movrow() { + _init_amx(); + let array: [[u8; 64]; 16] = array::from_fn(|i| [i as _; _]); + + let mut config = __tilecfg::default(); + config.palette = 1; + config.colsb[0] = 64; + config.rows[0] = 16; + _tile_loadconfig(config.as_ptr()); + _tile_loadd::<0>(array.as_ptr().cast(), 64); + for i in 0..16 { + let row = _tile_movrow::<0>(i); + assert_eq!(*row.as_u8x64().as_array(), [i as _; _]); + } + } + + #[simd_test(enable = "amx-avx512,avx10.2")] + unsafe fn test_tile_cvtrowd2ps() { + _init_amx(); + let array: [[u32; 16]; 16] = array::from_fn(|i| [i as _; _]); + + let mut config = __tilecfg::default(); + config.palette = 1; + config.colsb[0] = 64; + config.rows[0] = 16; + _tile_loadconfig(config.as_ptr()); + _tile_loadd::<0>(array.as_ptr().cast(), 64); + for i in 0..16 { + let row = _tile_cvtrowd2ps::<0>(i); + assert_eq!(*row.as_f32x16().as_array(), [i as _; _]); + } + } + + #[simd_test(enable = "amx-avx512,avx10.2")] + unsafe fn test_tile_cvtrowps2phh() { + _init_amx(); + let array: [[f32; 16]; 16] = array::from_fn(|i| [i as _; _]); + + let mut config = __tilecfg::default(); + config.palette = 1; + config.colsb[0] = 64; + config.rows[0] = 16; + _tile_loadconfig(config.as_ptr()); + _tile_loadd::<0>(array.as_ptr().cast(), 64); + for i in 0..16 { + let row = _tile_cvtrowps2phh::<0>(i); + assert_eq!( + *row.as_f16x32().as_array(), + array::from_fn(|j| if j & 1 == 0 { 0.0 } else { i as _ }) + ); + } + } + + #[simd_test(enable = "amx-avx512,avx10.2")] + unsafe fn test_tile_cvtrowps2phl() { + _init_amx(); + let array: [[f32; 16]; 
16] = array::from_fn(|i| [i as _; _]); + + let mut config = __tilecfg::default(); + config.palette = 1; + config.colsb[0] = 64; + config.rows[0] = 16; + _tile_loadconfig(config.as_ptr()); + _tile_loadd::<0>(array.as_ptr().cast(), 64); + for i in 0..16 { + let row = _tile_cvtrowps2phl::<0>(i); + assert_eq!( + *row.as_f16x32().as_array(), + array::from_fn(|j| if j & 1 == 0 { i as _ } else { 0.0 }) + ); + } + } + + #[simd_test(enable = "amx-tf32")] + unsafe fn test_tile_mmultf32ps() { + _init_amx(); + let a: [[f32; 16]; 16] = array::from_fn(|i| [i as _; _]); + let b: [[f32; 16]; 16] = [array::from_fn(|j| j as _); _]; + let mut res = [[0.0; 16]; 16]; + + let mut config = __tilecfg::default(); + config.palette = 1; + (0..=2).for_each(|i| { + config.colsb[i] = 64; + config.rows[i] = 16; + }); + _tile_loadconfig(config.as_ptr()); + _tile_zero::<0>(); + _tile_loadd::<1>(a.as_ptr().cast(), 64); + _tile_loadd::<2>(b.as_ptr().cast(), 64); + _tile_mmultf32ps::<0, 1, 2>(); + _tile_stored::<0>(res.as_mut_ptr().cast(), 64); + _tile_release(); + + let expected = array::from_fn(|i| array::from_fn(|j| 16.0 * i as f32 * j as f32)); + assert_eq!(res, expected); + } } From f9dc790aa5e636165a2f730de94f06eec28b710a Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Sun, 2 Nov 2025 20:09:54 +0100 Subject: [PATCH 06/17] improve `_mm256_permute2f128` tests --- .../stdarch/crates/core_arch/src/x86/avx.rs | 33 ++++++++++++++----- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/library/stdarch/crates/core_arch/src/x86/avx.rs b/library/stdarch/crates/core_arch/src/x86/avx.rs index c2c2febf18291..7ea5f1f4ff416 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx.rs @@ -3928,28 +3928,43 @@ mod tests { #[simd_test(enable = "avx")] unsafe fn test_mm256_permute2f128_ps() { - let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.); - let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.); - let r = _mm256_permute2f128_ps::<0x13>(a, b); - let e = _mm256_setr_ps(5., 6., 7., 8., 1., 2., 3., 4.); + let a = _mm256_setr_ps(11., 12., 13., 14., 15., 16., 17., 18.); + let b = _mm256_setr_ps(21., 22., 23., 24., 25., 26., 27., 28.); + let r = _mm256_permute2f128_ps::<0b0001_0011>(a, b); + let e = _mm256_setr_ps(25., 26., 27., 28., 15., 16., 17., 18.); assert_eq_m256(r, e); + + // Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field. + let r = _mm256_permute2f128_ps::<0b1001_1011>(a, b); + let z = _mm256_setr_ps(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0); + assert_eq_m256(r, z); } #[simd_test(enable = "avx")] unsafe fn test_mm256_permute2f128_pd() { let a = _mm256_setr_pd(1., 2., 3., 4.); let b = _mm256_setr_pd(5., 6., 7., 8.); - let r = _mm256_permute2f128_pd::<0x31>(a, b); + let r = _mm256_permute2f128_pd::<0b0011_0001>(a, b); let e = _mm256_setr_pd(3., 4., 7., 8.); assert_eq_m256d(r, e); + + // Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field. 
+ let r = _mm256_permute2f128_pd::<0b1011_1001>(a, b); + let e = _mm256_setr_pd(0.0, 0.0, 0.0, 0.0); + assert_eq_m256d(r, e); } #[simd_test(enable = "avx")] unsafe fn test_mm256_permute2f128_si256() { - let a = _mm256_setr_epi32(1, 2, 3, 4, 1, 2, 3, 4); - let b = _mm256_setr_epi32(5, 6, 7, 8, 5, 6, 7, 8); - let r = _mm256_permute2f128_si256::<0x20>(a, b); - let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let a = _mm256_setr_epi32(11, 12, 13, 14, 15, 16, 17, 18); + let b = _mm256_setr_epi32(21, 22, 23, 24, 25, 26, 27, 28); + let r = _mm256_permute2f128_si256::<0b0010_0000>(a, b); + let e = _mm256_setr_epi32(11, 12, 13, 14, 21, 22, 23, 24); + assert_eq_m256i(r, e); + + // Setting bits 3 or 7 (zero-indexed) zeroes the corresponding field. + let r = _mm256_permute2f128_si256::<0b1010_1000>(a, b); + let e = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m256i(r, e); } From 91261454192047560c81ab20068bdbd015b427a6 Mon Sep 17 00:00:00 2001 From: sayantn Date: Sat, 11 Oct 2025 01:34:25 +0530 Subject: [PATCH 07/17] Use generic SIMD masked load/stores for avx512 masked load/stores --- .../stdarch/crates/core_arch/src/macros.rs | 14 + .../crates/core_arch/src/x86/avx512bw.rs | 63 ++--- .../crates/core_arch/src/x86/avx512f.rs | 244 +++++++----------- 3 files changed, 134 insertions(+), 187 deletions(-) diff --git a/library/stdarch/crates/core_arch/src/macros.rs b/library/stdarch/crates/core_arch/src/macros.rs index e00b43353679e..1030d7e9740ac 100644 --- a/library/stdarch/crates/core_arch/src/macros.rs +++ b/library/stdarch/crates/core_arch/src/macros.rs @@ -163,3 +163,17 @@ macro_rules! simd_extract { ($x:expr, $idx:expr $(,)?) => {{ $crate::intrinsics::simd::simd_extract($x, const { $idx }) }}; ($x:expr, $idx:expr, $ty:ty $(,)?) => {{ $crate::intrinsics::simd::simd_extract::<_, $ty>($x, const { $idx }) }}; } + +#[allow(unused)] +macro_rules! simd_masked_load { + ($align:expr, $mask:expr, $ptr:expr, $default:expr) => { + $crate::intrinsics::simd::simd_masked_load::<_, _, _, { $align }>($mask, $ptr, $default) + }; +} + +#[allow(unused)] +macro_rules! 
simd_masked_store { + ($align:expr, $mask:expr, $ptr:expr, $default:expr) => { + $crate::intrinsics::simd::simd_masked_store::<_, _, _, { $align }>($mask, $ptr, $default) + }; +} diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs index fadc0e2cc09bd..72842f4546754 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs @@ -5609,7 +5609,8 @@ pub unsafe fn _mm_storeu_epi8(mem_addr: *mut i8, a: __m128i) { #[cfg_attr(test, assert_instr(vmovdqu16))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm512_mask_loadu_epi16(src: __m512i, k: __mmask32, mem_addr: *const i16) -> __m512i { - transmute(loaddqu16_512(mem_addr, src.as_i16x32(), k)) + let mask = simd_select_bitmask(k, i16x32::splat(!0), i16x32::ZERO); + simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, src.as_i16x32()).as_m512i() } /// Load packed 16-bit integers from memory into dst using zeromask k @@ -5635,7 +5636,8 @@ pub unsafe fn _mm512_maskz_loadu_epi16(k: __mmask32, mem_addr: *const i16) -> __ #[cfg_attr(test, assert_instr(vmovdqu8))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm512_mask_loadu_epi8(src: __m512i, k: __mmask64, mem_addr: *const i8) -> __m512i { - transmute(loaddqu8_512(mem_addr, src.as_i8x64(), k)) + let mask = simd_select_bitmask(k, i8x64::splat(!0), i8x64::ZERO); + simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, src.as_i8x64()).as_m512i() } /// Load packed 8-bit integers from memory into dst using zeromask k @@ -5661,7 +5663,8 @@ pub unsafe fn _mm512_maskz_loadu_epi8(k: __mmask64, mem_addr: *const i8) -> __m5 #[cfg_attr(test, assert_instr(vmovdqu16))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm256_mask_loadu_epi16(src: __m256i, k: __mmask16, mem_addr: *const i16) -> __m256i { - transmute(loaddqu16_256(mem_addr, src.as_i16x16(), k)) + let mask = simd_select_bitmask(k, i16x16::splat(!0), i16x16::ZERO); + simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, src.as_i16x16()).as_m256i() } /// Load packed 16-bit integers from memory into dst using zeromask k @@ -5687,7 +5690,8 @@ pub unsafe fn _mm256_maskz_loadu_epi16(k: __mmask16, mem_addr: *const i16) -> __ #[cfg_attr(test, assert_instr(vmovdqu8))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm256_mask_loadu_epi8(src: __m256i, k: __mmask32, mem_addr: *const i8) -> __m256i { - transmute(loaddqu8_256(mem_addr, src.as_i8x32(), k)) + let mask = simd_select_bitmask(k, i8x32::splat(!0), i8x32::ZERO); + simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, src.as_i8x32()).as_m256i() } /// Load packed 8-bit integers from memory into dst using zeromask k @@ -5713,7 +5717,8 @@ pub unsafe fn _mm256_maskz_loadu_epi8(k: __mmask32, mem_addr: *const i8) -> __m2 #[cfg_attr(test, assert_instr(vmovdqu16))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm_mask_loadu_epi16(src: __m128i, k: __mmask8, mem_addr: *const i16) -> __m128i { - transmute(loaddqu16_128(mem_addr, src.as_i16x8(), k)) + let mask = simd_select_bitmask(k, i16x8::splat(!0), i16x8::ZERO); + simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, src.as_i16x8()).as_m128i() } /// Load packed 16-bit integers from memory into dst using zeromask k @@ -5739,7 +5744,8 @@ pub unsafe fn _mm_maskz_loadu_epi16(k: __mmask8, mem_addr: *const i16) -> __m128 #[cfg_attr(test, assert_instr(vmovdqu8))] #[stable(feature = "stdarch_x86_avx512", 
since = "1.89")] pub unsafe fn _mm_mask_loadu_epi8(src: __m128i, k: __mmask16, mem_addr: *const i8) -> __m128i { - transmute(loaddqu8_128(mem_addr, src.as_i8x16(), k)) + let mask = simd_select_bitmask(k, i8x16::splat(!0), i8x16::ZERO); + simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, src.as_i8x16()).as_m128i() } /// Load packed 8-bit integers from memory into dst using zeromask k @@ -5764,7 +5770,8 @@ pub unsafe fn _mm_maskz_loadu_epi8(k: __mmask16, mem_addr: *const i8) -> __m128i #[cfg_attr(test, assert_instr(vmovdqu16))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm512_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask32, a: __m512i) { - storedqu16_512(mem_addr, a.as_i16x32(), mask) + let mask = simd_select_bitmask(mask, i16x32::splat(!0), i16x32::ZERO); + simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i16x32()); } /// Store packed 8-bit integers from a into memory using writemask k. @@ -5776,7 +5783,8 @@ pub unsafe fn _mm512_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask32, a: _ #[cfg_attr(test, assert_instr(vmovdqu8))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm512_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask64, a: __m512i) { - storedqu8_512(mem_addr, a.as_i8x64(), mask) + let mask = simd_select_bitmask(mask, i8x64::splat(!0), i8x64::ZERO); + simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i8x64()); } /// Store packed 16-bit integers from a into memory using writemask k. @@ -5788,7 +5796,8 @@ pub unsafe fn _mm512_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask64, a: __m #[cfg_attr(test, assert_instr(vmovdqu16))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm256_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask16, a: __m256i) { - storedqu16_256(mem_addr, a.as_i16x16(), mask) + let mask = simd_select_bitmask(mask, i16x16::splat(!0), i16x16::ZERO); + simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i16x16()); } /// Store packed 8-bit integers from a into memory using writemask k. @@ -5800,7 +5809,8 @@ pub unsafe fn _mm256_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask16, a: _ #[cfg_attr(test, assert_instr(vmovdqu8))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm256_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask32, a: __m256i) { - storedqu8_256(mem_addr, a.as_i8x32(), mask) + let mask = simd_select_bitmask(mask, i8x32::splat(!0), i8x32::ZERO); + simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i8x32()); } /// Store packed 16-bit integers from a into memory using writemask k. @@ -5812,7 +5822,8 @@ pub unsafe fn _mm256_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask32, a: __m #[cfg_attr(test, assert_instr(vmovdqu16))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask8, a: __m128i) { - storedqu16_128(mem_addr, a.as_i16x8(), mask) + let mask = simd_select_bitmask(mask, i16x8::splat(!0), i16x8::ZERO); + simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i16x8()); } /// Store packed 8-bit integers from a into memory using writemask k. 
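For reference, the lane-by-lane behaviour that the `simd_masked_load!`/`simd_masked_store!` calls above implement, as a minimal plain-Rust sketch: bit i of the bitmask selects lane i. The `*_ref` helpers are illustrative only, not part of core_arch.

    fn mask_loadu_ref(src: [i16; 8], k: u8, mem: &[i16; 8]) -> [i16; 8] {
        // selected lanes come from memory, unselected lanes keep `src`
        core::array::from_fn(|i| if (k >> i) & 1 == 1 { mem[i] } else { src[i] })
    }

    fn mask_storeu_ref(mem: &mut [i16; 8], k: u8, a: [i16; 8]) {
        for i in 0..8 {
            if (k >> i) & 1 == 1 {
                mem[i] = a[i]; // unselected lanes leave memory untouched
            }
        }
    }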
@@ -5824,7 +5835,8 @@ pub unsafe fn _mm_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask8, a: __m12 #[cfg_attr(test, assert_instr(vmovdqu8))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask16, a: __m128i) { - storedqu8_128(mem_addr, a.as_i8x16(), mask) + let mask = simd_select_bitmask(mask, i8x16::splat(!0), i8x16::ZERO); + simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i8x16()); } /// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst. @@ -11733,33 +11745,6 @@ unsafe extern "C" { fn vpmovuswbmem256(mem_addr: *mut i8, a: i16x16, mask: u16); #[link_name = "llvm.x86.avx512.mask.pmovus.wb.mem.128"] fn vpmovuswbmem128(mem_addr: *mut i8, a: i16x8, mask: u8); - - #[link_name = "llvm.x86.avx512.mask.loadu.b.128"] - fn loaddqu8_128(mem_addr: *const i8, a: i8x16, mask: u16) -> i8x16; - #[link_name = "llvm.x86.avx512.mask.loadu.w.128"] - fn loaddqu16_128(mem_addr: *const i16, a: i16x8, mask: u8) -> i16x8; - #[link_name = "llvm.x86.avx512.mask.loadu.b.256"] - fn loaddqu8_256(mem_addr: *const i8, a: i8x32, mask: u32) -> i8x32; - #[link_name = "llvm.x86.avx512.mask.loadu.w.256"] - fn loaddqu16_256(mem_addr: *const i16, a: i16x16, mask: u16) -> i16x16; - #[link_name = "llvm.x86.avx512.mask.loadu.b.512"] - fn loaddqu8_512(mem_addr: *const i8, a: i8x64, mask: u64) -> i8x64; - #[link_name = "llvm.x86.avx512.mask.loadu.w.512"] - fn loaddqu16_512(mem_addr: *const i16, a: i16x32, mask: u32) -> i16x32; - - #[link_name = "llvm.x86.avx512.mask.storeu.b.128"] - fn storedqu8_128(mem_addr: *mut i8, a: i8x16, mask: u16); - #[link_name = "llvm.x86.avx512.mask.storeu.w.128"] - fn storedqu16_128(mem_addr: *mut i16, a: i16x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.storeu.b.256"] - fn storedqu8_256(mem_addr: *mut i8, a: i8x32, mask: u32); - #[link_name = "llvm.x86.avx512.mask.storeu.w.256"] - fn storedqu16_256(mem_addr: *mut i16, a: i16x16, mask: u16); - #[link_name = "llvm.x86.avx512.mask.storeu.b.512"] - fn storedqu8_512(mem_addr: *mut i8, a: i8x64, mask: u64); - #[link_name = "llvm.x86.avx512.mask.storeu.w.512"] - fn storedqu16_512(mem_addr: *mut i16, a: i16x32, mask: u32); - } #[cfg(test)] diff --git a/library/stdarch/crates/core_arch/src/x86/avx512f.rs b/library/stdarch/crates/core_arch/src/x86/avx512f.rs index b60df7dbc9a3e..d39b741f00dac 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512f.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512f.rs @@ -34715,7 +34715,8 @@ pub unsafe fn _mm512_store_pd(mem_addr: *mut f64, a: __m512d) { #[cfg_attr(test, assert_instr(vmovdqu32))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm512_mask_loadu_epi32(src: __m512i, k: __mmask16, mem_addr: *const i32) -> __m512i { - transmute(loaddqu32_512(mem_addr, src.as_i32x16(), k)) + let mask = simd_select_bitmask(k, i32x16::splat(!0), i32x16::ZERO); + simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, src.as_i32x16()).as_m512i() } /// Load packed 32-bit integers from memory into dst using zeromask k @@ -34741,7 +34742,8 @@ pub unsafe fn _mm512_maskz_loadu_epi32(k: __mmask16, mem_addr: *const i32) -> __ #[cfg_attr(test, assert_instr(vmovdqu64))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm512_mask_loadu_epi64(src: __m512i, k: __mmask8, mem_addr: *const i64) -> __m512i { - transmute(loaddqu64_512(mem_addr, src.as_i64x8(), 
k)) + let mask = simd_select_bitmask(k, i64x8::splat(!0), i64x8::ZERO); + simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, src.as_i64x8()).as_m512i() } /// Load packed 64-bit integers from memory into dst using zeromask k @@ -34767,7 +34769,8 @@ pub unsafe fn _mm512_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m #[cfg_attr(test, assert_instr(vmovups))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm512_mask_loadu_ps(src: __m512, k: __mmask16, mem_addr: *const f32) -> __m512 { - transmute(loadups_512(mem_addr, src.as_f32x16(), k)) + let mask = simd_select_bitmask(k, i32x16::splat(!0), i32x16::ZERO); + simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, src.as_f32x16()).as_m512() } /// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k @@ -34793,7 +34796,8 @@ pub unsafe fn _mm512_maskz_loadu_ps(k: __mmask16, mem_addr: *const f32) -> __m51 #[cfg_attr(test, assert_instr(vmovupd))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm512_mask_loadu_pd(src: __m512d, k: __mmask8, mem_addr: *const f64) -> __m512d { - transmute(loadupd_512(mem_addr, src.as_f64x8(), k)) + let mask = simd_select_bitmask(k, i64x8::splat(!0), i64x8::ZERO); + simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, src.as_f64x8()).as_m512d() } /// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k @@ -34819,7 +34823,8 @@ pub unsafe fn _mm512_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m512 #[cfg_attr(test, assert_instr(vmovdqu32))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm256_mask_loadu_epi32(src: __m256i, k: __mmask8, mem_addr: *const i32) -> __m256i { - transmute(loaddqu32_256(mem_addr, src.as_i32x8(), k)) + let mask = simd_select_bitmask(k, i32x8::splat(!0), i32x8::ZERO); + simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, src.as_i32x8()).as_m256i() } /// Load packed 32-bit integers from memory into dst using zeromask k @@ -34845,7 +34850,8 @@ pub unsafe fn _mm256_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m #[cfg_attr(test, assert_instr(vmovdqu64))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm256_mask_loadu_epi64(src: __m256i, k: __mmask8, mem_addr: *const i64) -> __m256i { - transmute(loaddqu64_256(mem_addr, src.as_i64x4(), k)) + let mask = simd_select_bitmask(k, i64x4::splat(!0), i64x4::ZERO); + simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, src.as_i64x4()).as_m256i() } /// Load packed 64-bit integers from memory into dst using zeromask k @@ -34871,7 +34877,8 @@ pub unsafe fn _mm256_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m #[cfg_attr(test, assert_instr(vmovups))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm256_mask_loadu_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 { - transmute(loadups_256(mem_addr, src.as_f32x8(), k)) + let mask = simd_select_bitmask(k, i32x8::splat(!0), i32x8::ZERO); + simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, src.as_f32x8()).as_m256() } /// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k @@ -34897,7 +34904,8 @@ pub unsafe fn _mm256_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m256 #[cfg_attr(test, assert_instr(vmovupd))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm256_mask_loadu_pd(src: __m256d, k: __mmask8, mem_addr: *const f64) -> __m256d { - 
transmute(loadupd_256(mem_addr, src.as_f64x4(), k)) + let mask = simd_select_bitmask(k, i64x4::splat(!0), i64x4::ZERO); + simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, src.as_f64x4()).as_m256d() } /// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k @@ -34923,7 +34931,8 @@ pub unsafe fn _mm256_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m256 #[cfg_attr(test, assert_instr(vmovdqu32))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm_mask_loadu_epi32(src: __m128i, k: __mmask8, mem_addr: *const i32) -> __m128i { - transmute(loaddqu32_128(mem_addr, src.as_i32x4(), k)) + let mask = simd_select_bitmask(k, i32x4::splat(!0), i32x4::ZERO); + simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, src.as_i32x4()).as_m128i() } /// Load packed 32-bit integers from memory into dst using zeromask k @@ -34949,7 +34958,8 @@ pub unsafe fn _mm_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m128 #[cfg_attr(test, assert_instr(vmovdqu64))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm_mask_loadu_epi64(src: __m128i, k: __mmask8, mem_addr: *const i64) -> __m128i { - transmute(loaddqu64_128(mem_addr, src.as_i64x2(), k)) + let mask = simd_select_bitmask(k, i64x2::splat(!0), i64x2::ZERO); + simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, src.as_i64x2()).as_m128i() } /// Load packed 64-bit integers from memory into dst using zeromask k @@ -34975,7 +34985,8 @@ pub unsafe fn _mm_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m128 #[cfg_attr(test, assert_instr(vmovups))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm_mask_loadu_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 { - transmute(loadups_128(mem_addr, src.as_f32x4(), k)) + let mask = simd_select_bitmask(k, i32x4::splat(!0), i32x4::ZERO); + simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, src.as_f32x4()).as_m128() } /// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k @@ -35001,7 +35012,8 @@ pub unsafe fn _mm_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { #[cfg_attr(test, assert_instr(vmovupd))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm_mask_loadu_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d { - transmute(loadupd_128(mem_addr, src.as_f64x2(), k)) + let mask = simd_select_bitmask(k, i64x2::splat(!0), i64x2::ZERO); + simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, src.as_f64x2()).as_m128d() } /// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k @@ -35027,7 +35039,8 @@ pub unsafe fn _mm_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m128d { #[cfg_attr(test, assert_instr(vmovdqa32))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm512_mask_load_epi32(src: __m512i, k: __mmask16, mem_addr: *const i32) -> __m512i { - transmute(loaddqa32_512(mem_addr, src.as_i32x16(), k)) + let mask = simd_select_bitmask(k, i32x16::splat(!0), i32x16::ZERO); + simd_masked_load!(SimdAlign::Vector, mask, mem_addr, src.as_i32x16()).as_m512i() } /// Load packed 32-bit integers from memory into dst using zeromask k @@ -35053,7 +35066,8 @@ pub unsafe fn _mm512_maskz_load_epi32(k: __mmask16, mem_addr: *const i32) -> __m #[cfg_attr(test, assert_instr(vmovdqa64))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm512_mask_load_epi64(src: __m512i, k: __mmask8, 
mem_addr: *const i64) -> __m512i { - transmute(loaddqa64_512(mem_addr, src.as_i64x8(), k)) + let mask = simd_select_bitmask(k, i64x8::splat(!0), i64x8::ZERO); + simd_masked_load!(SimdAlign::Vector, mask, mem_addr, src.as_i64x8()).as_m512i() } /// Load packed 64-bit integers from memory into dst using zeromask k @@ -35079,7 +35093,8 @@ pub unsafe fn _mm512_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m5 #[cfg_attr(test, assert_instr(vmovaps))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm512_mask_load_ps(src: __m512, k: __mmask16, mem_addr: *const f32) -> __m512 { - transmute(loadaps_512(mem_addr, src.as_f32x16(), k)) + let mask = simd_select_bitmask(k, i32x16::splat(!0), i32x16::ZERO); + simd_masked_load!(SimdAlign::Vector, mask, mem_addr, src.as_f32x16()).as_m512() } /// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k @@ -35105,7 +35120,8 @@ pub unsafe fn _mm512_maskz_load_ps(k: __mmask16, mem_addr: *const f32) -> __m512 #[cfg_attr(test, assert_instr(vmovapd))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm512_mask_load_pd(src: __m512d, k: __mmask8, mem_addr: *const f64) -> __m512d { - transmute(loadapd_512(mem_addr, src.as_f64x8(), k)) + let mask = simd_select_bitmask(k, i64x8::splat(!0), i64x8::ZERO); + simd_masked_load!(SimdAlign::Vector, mask, mem_addr, src.as_f64x8()).as_m512d() } /// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k @@ -35131,7 +35147,8 @@ pub unsafe fn _mm512_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m512d #[cfg_attr(test, assert_instr(vmovdqa32))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm256_mask_load_epi32(src: __m256i, k: __mmask8, mem_addr: *const i32) -> __m256i { - transmute(loaddqa32_256(mem_addr, src.as_i32x8(), k)) + let mask = simd_select_bitmask(k, i32x8::splat(!0), i32x8::ZERO); + simd_masked_load!(SimdAlign::Vector, mask, mem_addr, src.as_i32x8()).as_m256i() } /// Load packed 32-bit integers from memory into dst using zeromask k @@ -35157,7 +35174,8 @@ pub unsafe fn _mm256_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m2 #[cfg_attr(test, assert_instr(vmovdqa64))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm256_mask_load_epi64(src: __m256i, k: __mmask8, mem_addr: *const i64) -> __m256i { - transmute(loaddqa64_256(mem_addr, src.as_i64x4(), k)) + let mask = simd_select_bitmask(k, i64x4::splat(!0), i64x4::ZERO); + simd_masked_load!(SimdAlign::Vector, mask, mem_addr, src.as_i64x4()).as_m256i() } /// Load packed 64-bit integers from memory into dst using zeromask k @@ -35183,7 +35201,8 @@ pub unsafe fn _mm256_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m2 #[cfg_attr(test, assert_instr(vmovaps))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm256_mask_load_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 { - transmute(loadaps_256(mem_addr, src.as_f32x8(), k)) + let mask = simd_select_bitmask(k, i32x8::splat(!0), i32x8::ZERO); + simd_masked_load!(SimdAlign::Vector, mask, mem_addr, src.as_f32x8()).as_m256() } /// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k @@ -35209,7 +35228,8 @@ pub unsafe fn _mm256_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m256 #[cfg_attr(test, assert_instr(vmovapd))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm256_mask_load_pd(src: 
__m256d, k: __mmask8, mem_addr: *const f64) -> __m256d { - transmute(loadapd_256(mem_addr, src.as_f64x4(), k)) + let mask = simd_select_bitmask(k, i64x4::splat(!0), i64x4::ZERO); + simd_masked_load!(SimdAlign::Vector, mask, mem_addr, src.as_f64x4()).as_m256d() } /// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k @@ -35235,7 +35255,8 @@ pub unsafe fn _mm256_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m256d #[cfg_attr(test, assert_instr(vmovdqa32))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm_mask_load_epi32(src: __m128i, k: __mmask8, mem_addr: *const i32) -> __m128i { - transmute(loaddqa32_128(mem_addr, src.as_i32x4(), k)) + let mask = simd_select_bitmask(k, i32x4::splat(!0), i32x4::ZERO); + simd_masked_load!(SimdAlign::Vector, mask, mem_addr, src.as_i32x4()).as_m128i() } /// Load packed 32-bit integers from memory into dst using zeromask k @@ -35261,7 +35282,8 @@ pub unsafe fn _mm_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i #[cfg_attr(test, assert_instr(vmovdqa64))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm_mask_load_epi64(src: __m128i, k: __mmask8, mem_addr: *const i64) -> __m128i { - transmute(loaddqa64_128(mem_addr, src.as_i64x2(), k)) + let mask = simd_select_bitmask(k, i64x2::splat(!0), i64x2::ZERO); + simd_masked_load!(SimdAlign::Vector, mask, mem_addr, src.as_i64x2()).as_m128i() } /// Load packed 64-bit integers from memory into dst using zeromask k @@ -35287,7 +35309,8 @@ pub unsafe fn _mm_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i #[cfg_attr(test, assert_instr(vmovaps))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm_mask_load_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 { - transmute(loadaps_128(mem_addr, src.as_f32x4(), k)) + let mask = simd_select_bitmask(k, i32x4::splat(!0), i32x4::ZERO); + simd_masked_load!(SimdAlign::Vector, mask, mem_addr, src.as_f32x4()).as_m128() } /// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k @@ -35313,7 +35336,8 @@ pub unsafe fn _mm_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { #[cfg_attr(test, assert_instr(vmovapd))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm_mask_load_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d { - transmute(loadapd_128(mem_addr, src.as_f64x2(), k)) + let mask = simd_select_bitmask(k, i64x2::splat(!0), i64x2::ZERO); + simd_masked_load!(SimdAlign::Vector, mask, mem_addr, src.as_f64x2()).as_m128d() } /// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k @@ -35426,7 +35450,8 @@ pub unsafe fn _mm_maskz_load_sd(k: __mmask8, mem_addr: *const f64) -> __m128d { #[cfg_attr(test, assert_instr(vmovdqu32))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm512_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask16, a: __m512i) { - storedqu32_512(mem_addr, a.as_i32x16(), mask) + let mask = simd_select_bitmask(mask, i32x16::splat(!0), i32x16::ZERO); + simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i32x16()); } /// Store packed 64-bit integers from a into memory using writemask k. 
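The `simd_select_bitmask(k, splat(!0), ZERO)` idiom used throughout these hunks expands the scalar `__mmask` into a full vector mask. A minimal sketch of that expansion for a 16-lane mask, assuming bit i controls lane i:

    fn expand_bitmask16(k: u16) -> [i32; 16] {
        // all-ones lane where the bit is set, all-zeros lane where it is clear
        core::array::from_fn(|i| if (k >> i) & 1 == 1 { !0 } else { 0 })
    }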
@@ -35438,7 +35463,8 @@ pub unsafe fn _mm512_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask16, a: _ #[cfg_attr(test, assert_instr(vmovdqu64))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm512_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m512i) { - storedqu64_512(mem_addr, a.as_i64x8(), mask) + let mask = simd_select_bitmask(mask, i64x8::splat(!0), i64x8::ZERO); + simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i64x8()); } /// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. @@ -35450,7 +35476,8 @@ pub unsafe fn _mm512_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __ #[cfg_attr(test, assert_instr(vmovups))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm512_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask16, a: __m512) { - storeups_512(mem_addr, a.as_f32x16(), mask) + let mask = simd_select_bitmask(mask, i32x16::splat(!0), i32x16::ZERO); + simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_f32x16()); } /// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. @@ -35462,7 +35489,8 @@ pub unsafe fn _mm512_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask16, a: __m5 #[cfg_attr(test, assert_instr(vmovupd))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm512_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512d) { - storeupd_512(mem_addr, a.as_f64x8(), mask) + let mask = simd_select_bitmask(mask, i64x8::splat(!0), i64x8::ZERO); + simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_f64x8()); } /// Store packed 32-bit integers from a into memory using writemask k. @@ -35474,7 +35502,8 @@ pub unsafe fn _mm512_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m51 #[cfg_attr(test, assert_instr(vmovdqu32))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm256_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m256i) { - storedqu32_256(mem_addr, a.as_i32x8(), mask) + let mask = simd_select_bitmask(mask, i32x8::splat(!0), i32x8::ZERO); + simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i32x8()); } /// Store packed 64-bit integers from a into memory using writemask k. @@ -35486,7 +35515,8 @@ pub unsafe fn _mm256_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __ #[cfg_attr(test, assert_instr(vmovdqu64))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm256_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m256i) { - storedqu64_256(mem_addr, a.as_i64x4(), mask) + let mask = simd_select_bitmask(mask, i64x4::splat(!0), i64x4::ZERO); + simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i64x4()); } /// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. @@ -35498,7 +35528,8 @@ pub unsafe fn _mm256_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __ #[cfg_attr(test, assert_instr(vmovups))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm256_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256) { - storeups_256(mem_addr, a.as_f32x8(), mask) + let mask = simd_select_bitmask(mask, i32x8::splat(!0), i32x8::ZERO); + simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_f32x8()); } /// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. 
@@ -35510,7 +35541,8 @@ pub unsafe fn _mm256_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m25 #[cfg_attr(test, assert_instr(vmovupd))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm256_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256d) { - storeupd_256(mem_addr, a.as_f64x4(), mask) + let mask = simd_select_bitmask(mask, i64x4::splat(!0), i64x4::ZERO); + simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_f64x4()); } /// Store packed 32-bit integers from a into memory using writemask k. @@ -35522,7 +35554,8 @@ pub unsafe fn _mm256_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m25 #[cfg_attr(test, assert_instr(vmovdqu32))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128i) { - storedqu32_128(mem_addr, a.as_i32x4(), mask) + let mask = simd_select_bitmask(mask, i32x4::splat(!0), i32x4::ZERO); + simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i32x4()); } /// Store packed 64-bit integers from a into memory using writemask k. @@ -35534,7 +35567,8 @@ pub unsafe fn _mm_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m12 #[cfg_attr(test, assert_instr(vmovdqu64))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128i) { - storedqu64_128(mem_addr, a.as_i64x2(), mask) + let mask = simd_select_bitmask(mask, i64x2::splat(!0), i64x2::ZERO); + simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i64x2()); } /// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. @@ -35546,7 +35580,8 @@ pub unsafe fn _mm_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m12 #[cfg_attr(test, assert_instr(vmovups))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) { - storeups_128(mem_addr, a.as_f32x4(), mask) + let mask = simd_select_bitmask(mask, i32x4::splat(!0), i32x4::ZERO); + simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_f32x4()); } /// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. @@ -35558,7 +35593,8 @@ pub unsafe fn _mm_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) #[cfg_attr(test, assert_instr(vmovupd))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) { - storeupd_128(mem_addr, a.as_f64x2(), mask) + let mask = simd_select_bitmask(mask, i64x2::splat(!0), i64x2::ZERO); + simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_f64x2()); } /// Store packed 32-bit integers from a into memory using writemask k. @@ -35570,7 +35606,8 @@ pub unsafe fn _mm_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) #[cfg_attr(test, assert_instr(vmovdqa32))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm512_mask_store_epi32(mem_addr: *mut i32, mask: __mmask16, a: __m512i) { - storedqa32_512(mem_addr, a.as_i32x16(), mask) + let mask = simd_select_bitmask(mask, i32x16::splat(!0), i32x16::ZERO); + simd_masked_store!(SimdAlign::Vector, mask, mem_addr, a.as_i32x16()); } /// Store packed 64-bit integers from a into memory using writemask k. 
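
The store direction mirrors this: a set mask bit writes the lane to memory, and a clear bit leaves those bytes completely untouched (they are not read-modify-written). A scalar sketch of the semantics `simd_masked_store!` is assumed to provide:

// Scalar model of a masked store: only lanes whose mask bit is set are
// written; masked-off memory is never touched.
fn masked_store8(k: u8, mem: &mut [i32; 8], a: [i32; 8]) {
    for i in 0..8 {
        if (k >> i) & 1 != 0 {
            mem[i] = a[i];
        }
    }
}

fn main() {
    let mut mem = [0; 8];
    masked_store8(0b1100_0001, &mut mem, [1, 2, 3, 4, 5, 6, 7, 8]);
    assert_eq!(mem, [1, 0, 0, 0, 0, 0, 7, 8]);
}
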
@@ -35582,7 +35619,8 @@ pub unsafe fn _mm512_mask_store_epi32(mem_addr: *mut i32, mask: __mmask16, a: __ #[cfg_attr(test, assert_instr(vmovdqa64))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm512_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m512i) { - storedqa64_512(mem_addr, a.as_i64x8(), mask) + let mask = simd_select_bitmask(mask, i64x8::splat(!0), i64x8::ZERO); + simd_masked_store!(SimdAlign::Vector, mask, mem_addr, a.as_i64x8()); } /// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. @@ -35594,7 +35632,8 @@ pub unsafe fn _mm512_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m #[cfg_attr(test, assert_instr(vmovaps))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm512_mask_store_ps(mem_addr: *mut f32, mask: __mmask16, a: __m512) { - storeaps_512(mem_addr, a.as_f32x16(), mask) + let mask = simd_select_bitmask(mask, i32x16::splat(!0), i32x16::ZERO); + simd_masked_store!(SimdAlign::Vector, mask, mem_addr, a.as_f32x16()); } /// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. @@ -35606,7 +35645,8 @@ pub unsafe fn _mm512_mask_store_ps(mem_addr: *mut f32, mask: __mmask16, a: __m51 #[cfg_attr(test, assert_instr(vmovapd))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm512_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512d) { - storeapd_512(mem_addr, a.as_f64x8(), mask) + let mask = simd_select_bitmask(mask, i64x8::splat(!0), i64x8::ZERO); + simd_masked_store!(SimdAlign::Vector, mask, mem_addr, a.as_f64x8()); } /// Store packed 32-bit integers from a into memory using writemask k. @@ -35618,7 +35658,8 @@ pub unsafe fn _mm512_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512 #[cfg_attr(test, assert_instr(vmovdqa32))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm256_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m256i) { - storedqa32_256(mem_addr, a.as_i32x8(), mask) + let mask = simd_select_bitmask(mask, i32x8::splat(!0), i32x8::ZERO); + simd_masked_store!(SimdAlign::Vector, mask, mem_addr, a.as_i32x8()); } /// Store packed 64-bit integers from a into memory using writemask k. @@ -35630,7 +35671,8 @@ pub unsafe fn _mm256_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m #[cfg_attr(test, assert_instr(vmovdqa64))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm256_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m256i) { - storedqa64_256(mem_addr, a.as_i64x4(), mask) + let mask = simd_select_bitmask(mask, i64x4::splat(!0), i64x4::ZERO); + simd_masked_store!(SimdAlign::Vector, mask, mem_addr, a.as_i64x4()); } /// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. @@ -35642,7 +35684,8 @@ pub unsafe fn _mm256_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m #[cfg_attr(test, assert_instr(vmovaps))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm256_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256) { - storeaps_256(mem_addr, a.as_f32x8(), mask) + let mask = simd_select_bitmask(mask, i32x8::splat(!0), i32x8::ZERO); + simd_masked_store!(SimdAlign::Vector, mask, mem_addr, a.as_f32x8()); } /// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. 
@@ -35654,7 +35697,8 @@ pub unsafe fn _mm256_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256 #[cfg_attr(test, assert_instr(vmovapd))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm256_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256d) { - storeapd_256(mem_addr, a.as_f64x4(), mask) + let mask = simd_select_bitmask(mask, i64x4::splat(!0), i64x4::ZERO); + simd_masked_store!(SimdAlign::Vector, mask, mem_addr, a.as_f64x4()); } /// Store packed 32-bit integers from a into memory using writemask k. @@ -35666,7 +35710,8 @@ pub unsafe fn _mm256_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256 #[cfg_attr(test, assert_instr(vmovdqa32))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128i) { - storedqa32_128(mem_addr, a.as_i32x4(), mask) + let mask = simd_select_bitmask(mask, i32x4::splat(!0), i32x4::ZERO); + simd_masked_store!(SimdAlign::Vector, mask, mem_addr, a.as_i32x4()); } /// Store packed 64-bit integers from a into memory using writemask k. @@ -35678,7 +35723,8 @@ pub unsafe fn _mm_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128 #[cfg_attr(test, assert_instr(vmovdqa64))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128i) { - storedqa64_128(mem_addr, a.as_i64x2(), mask) + let mask = simd_select_bitmask(mask, i64x2::splat(!0), i64x2::ZERO); + simd_masked_store!(SimdAlign::Vector, mask, mem_addr, a.as_i64x2()); } /// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. @@ -35690,7 +35736,8 @@ pub unsafe fn _mm_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128 #[cfg_attr(test, assert_instr(vmovaps))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) { - storeaps_128(mem_addr, a.as_f32x4(), mask) + let mask = simd_select_bitmask(mask, i32x4::splat(!0), i32x4::ZERO); + simd_masked_store!(SimdAlign::Vector, mask, mem_addr, a.as_f32x4()); } /// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. @@ -35702,7 +35749,8 @@ pub unsafe fn _mm_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) { #[cfg_attr(test, assert_instr(vmovapd))] #[stable(feature = "stdarch_x86_avx512", since = "1.89")] pub unsafe fn _mm_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) { - storeapd_128(mem_addr, a.as_f64x2(), mask) + let mask = simd_select_bitmask(mask, i64x2::splat(!0), i64x2::ZERO); + simd_masked_store!(SimdAlign::Vector, mask, mem_addr, a.as_f64x2()); } /// Store a single-precision (32-bit) floating-point element from a into memory using writemask k. 
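
The only difference between these aligned `store`/`load` rewrites and the earlier `storeu`/`loadu` ones is `SimdAlign::Vector` in place of `SimdAlign::Unaligned`: the aligned forms require `mem_addr` to be aligned to the full vector width (64 bytes for 512-bit, 32 for 256-bit, 16 for 128-bit vectors). A small sketch of how a caller can guarantee that precondition:

// `#[repr(align(64))]` gives storage that satisfies even the 512-bit
// aligned masked load/store variants.
#[repr(align(64))]
struct Aligned64([i64; 8]);

fn main() {
    let buf = Aligned64([0; 8]);
    assert_eq!(buf.0.as_ptr() as usize % 64, 0);
}
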
mem_addr @@ -43109,106 +43157,6 @@ unsafe extern "C" { #[link_name = "llvm.x86.avx512.vcomi.sd"] fn vcomisd(a: f64x2, b: f64x2, imm8: i32, sae: i32) -> i32; - #[link_name = "llvm.x86.avx512.mask.loadu.d.128"] - fn loaddqu32_128(mem_addr: *const i32, a: i32x4, mask: u8) -> i32x4; - #[link_name = "llvm.x86.avx512.mask.loadu.q.128"] - fn loaddqu64_128(mem_addr: *const i64, a: i64x2, mask: u8) -> i64x2; - #[link_name = "llvm.x86.avx512.mask.loadu.ps.128"] - fn loadups_128(mem_addr: *const f32, a: f32x4, mask: u8) -> f32x4; - #[link_name = "llvm.x86.avx512.mask.loadu.pd.128"] - fn loadupd_128(mem_addr: *const f64, a: f64x2, mask: u8) -> f64x2; - #[link_name = "llvm.x86.avx512.mask.loadu.d.256"] - fn loaddqu32_256(mem_addr: *const i32, a: i32x8, mask: u8) -> i32x8; - #[link_name = "llvm.x86.avx512.mask.loadu.q.256"] - fn loaddqu64_256(mem_addr: *const i64, a: i64x4, mask: u8) -> i64x4; - #[link_name = "llvm.x86.avx512.mask.loadu.ps.256"] - fn loadups_256(mem_addr: *const f32, a: f32x8, mask: u8) -> f32x8; - #[link_name = "llvm.x86.avx512.mask.loadu.pd.256"] - fn loadupd_256(mem_addr: *const f64, a: f64x4, mask: u8) -> f64x4; - #[link_name = "llvm.x86.avx512.mask.loadu.d.512"] - fn loaddqu32_512(mem_addr: *const i32, a: i32x16, mask: u16) -> i32x16; - #[link_name = "llvm.x86.avx512.mask.loadu.q.512"] - fn loaddqu64_512(mem_addr: *const i64, a: i64x8, mask: u8) -> i64x8; - #[link_name = "llvm.x86.avx512.mask.loadu.ps.512"] - fn loadups_512(mem_addr: *const f32, a: f32x16, mask: u16) -> f32x16; - #[link_name = "llvm.x86.avx512.mask.loadu.pd.512"] - fn loadupd_512(mem_addr: *const f64, a: f64x8, mask: u8) -> f64x8; - - #[link_name = "llvm.x86.avx512.mask.load.d.128"] - fn loaddqa32_128(mem_addr: *const i32, a: i32x4, mask: u8) -> i32x4; - #[link_name = "llvm.x86.avx512.mask.load.q.128"] - fn loaddqa64_128(mem_addr: *const i64, a: i64x2, mask: u8) -> i64x2; - #[link_name = "llvm.x86.avx512.mask.load.ps.128"] - fn loadaps_128(mem_addr: *const f32, a: f32x4, mask: u8) -> f32x4; - #[link_name = "llvm.x86.avx512.mask.load.pd.128"] - fn loadapd_128(mem_addr: *const f64, a: f64x2, mask: u8) -> f64x2; - #[link_name = "llvm.x86.avx512.mask.load.d.256"] - fn loaddqa32_256(mem_addr: *const i32, a: i32x8, mask: u8) -> i32x8; - #[link_name = "llvm.x86.avx512.mask.load.q.256"] - fn loaddqa64_256(mem_addr: *const i64, a: i64x4, mask: u8) -> i64x4; - #[link_name = "llvm.x86.avx512.mask.load.ps.256"] - fn loadaps_256(mem_addr: *const f32, a: f32x8, mask: u8) -> f32x8; - #[link_name = "llvm.x86.avx512.mask.load.pd.256"] - fn loadapd_256(mem_addr: *const f64, a: f64x4, mask: u8) -> f64x4; - #[link_name = "llvm.x86.avx512.mask.load.d.512"] - fn loaddqa32_512(mem_addr: *const i32, a: i32x16, mask: u16) -> i32x16; - #[link_name = "llvm.x86.avx512.mask.load.q.512"] - fn loaddqa64_512(mem_addr: *const i64, a: i64x8, mask: u8) -> i64x8; - #[link_name = "llvm.x86.avx512.mask.load.ps.512"] - fn loadaps_512(mem_addr: *const f32, a: f32x16, mask: u16) -> f32x16; - #[link_name = "llvm.x86.avx512.mask.load.pd.512"] - fn loadapd_512(mem_addr: *const f64, a: f64x8, mask: u8) -> f64x8; - - #[link_name = "llvm.x86.avx512.mask.storeu.d.128"] - fn storedqu32_128(mem_addr: *mut i32, a: i32x4, mask: u8); - #[link_name = "llvm.x86.avx512.mask.storeu.q.128"] - fn storedqu64_128(mem_addr: *mut i64, a: i64x2, mask: u8); - #[link_name = "llvm.x86.avx512.mask.storeu.ps.128"] - fn storeups_128(mem_addr: *mut f32, a: f32x4, mask: u8); - #[link_name = "llvm.x86.avx512.mask.storeu.pd.128"] - fn storeupd_128(mem_addr: *mut f64, a: f64x2, mask: u8); 
- #[link_name = "llvm.x86.avx512.mask.storeu.d.256"] - fn storedqu32_256(mem_addr: *mut i32, a: i32x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.storeu.q.256"] - fn storedqu64_256(mem_addr: *mut i64, a: i64x4, mask: u8); - #[link_name = "llvm.x86.avx512.mask.storeu.ps.256"] - fn storeups_256(mem_addr: *mut f32, a: f32x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.storeu.pd.256"] - fn storeupd_256(mem_addr: *mut f64, a: f64x4, mask: u8); - #[link_name = "llvm.x86.avx512.mask.storeu.d.512"] - fn storedqu32_512(mem_addr: *mut i32, a: i32x16, mask: u16); - #[link_name = "llvm.x86.avx512.mask.storeu.q.512"] - fn storedqu64_512(mem_addr: *mut i64, a: i64x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.storeu.ps.512"] - fn storeups_512(mem_addr: *mut f32, a: f32x16, mask: u16); - #[link_name = "llvm.x86.avx512.mask.storeu.pd.512"] - fn storeupd_512(mem_addr: *mut f64, a: f64x8, mask: u8); - - #[link_name = "llvm.x86.avx512.mask.store.d.128"] - fn storedqa32_128(mem_addr: *mut i32, a: i32x4, mask: u8); - #[link_name = "llvm.x86.avx512.mask.store.q.128"] - fn storedqa64_128(mem_addr: *mut i64, a: i64x2, mask: u8); - #[link_name = "llvm.x86.avx512.mask.store.ps.128"] - fn storeaps_128(mem_addr: *mut f32, a: f32x4, mask: u8); - #[link_name = "llvm.x86.avx512.mask.store.pd.128"] - fn storeapd_128(mem_addr: *mut f64, a: f64x2, mask: u8); - #[link_name = "llvm.x86.avx512.mask.store.d.256"] - fn storedqa32_256(mem_addr: *mut i32, a: i32x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.store.q.256"] - fn storedqa64_256(mem_addr: *mut i64, a: i64x4, mask: u8); - #[link_name = "llvm.x86.avx512.mask.store.ps.256"] - fn storeaps_256(mem_addr: *mut f32, a: f32x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.store.pd.256"] - fn storeapd_256(mem_addr: *mut f64, a: f64x4, mask: u8); - #[link_name = "llvm.x86.avx512.mask.store.d.512"] - fn storedqa32_512(mem_addr: *mut i32, a: i32x16, mask: u16); - #[link_name = "llvm.x86.avx512.mask.store.q.512"] - fn storedqa64_512(mem_addr: *mut i64, a: i64x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.store.ps.512"] - fn storeaps_512(mem_addr: *mut f32, a: f32x16, mask: u16); - #[link_name = "llvm.x86.avx512.mask.store.pd.512"] - fn storeapd_512(mem_addr: *mut f64, a: f64x8, mask: u8); - #[link_name = "llvm.x86.avx512.mask.expand.load.d.128"] fn expandloadd_128(mem_addr: *const i32, a: i32x4, mask: u8) -> i32x4; #[link_name = "llvm.x86.avx512.mask.expand.load.q.128"] From 7ea8483696cbb92d7b620ce46197f6013e6b70e1 Mon Sep 17 00:00:00 2001 From: sayantn Date: Thu, 6 Nov 2025 06:26:16 +0530 Subject: [PATCH 08/17] Use generic SIMD intrinsics for AVX `maskload` and `maskstore` intrinsics --- .../stdarch/crates/core_arch/src/x86/avx.rs | 40 ++++++++----------- .../stdarch/crates/core_arch/src/x86/avx2.rs | 40 ++++++++----------- 2 files changed, 32 insertions(+), 48 deletions(-) diff --git a/library/stdarch/crates/core_arch/src/x86/avx.rs b/library/stdarch/crates/core_arch/src/x86/avx.rs index 7ea5f1f4ff416..c50c83fcaa8f4 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx.rs @@ -1675,7 +1675,8 @@ pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) { #[cfg_attr(test, assert_instr(vmaskmovpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d { - maskloadpd256(mem_addr as *const i8, mask.as_i64x4()) + let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63)); + simd_masked_load!(SimdAlign::Unaligned, 
mask, mem_addr, _mm256_setzero_pd()) } /// Stores packed double-precision (64-bit) floating-point elements from `a` @@ -1687,7 +1688,8 @@ pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d #[cfg_attr(test, assert_instr(vmaskmovpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) { - maskstorepd256(mem_addr as *mut i8, mask.as_i64x4(), a); + let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63)); + simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a) } /// Loads packed double-precision (64-bit) floating-point elements from memory @@ -1700,7 +1702,8 @@ pub unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) #[cfg_attr(test, assert_instr(vmaskmovpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d { - maskloadpd(mem_addr as *const i8, mask.as_i64x2()) + let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63)); + simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm_setzero_pd()) } /// Stores packed double-precision (64-bit) floating-point elements from `a` @@ -1712,7 +1715,8 @@ pub unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d { #[cfg_attr(test, assert_instr(vmaskmovpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) { - maskstorepd(mem_addr as *mut i8, mask.as_i64x2(), a); + let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63)); + simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a) } /// Loads packed single-precision (32-bit) floating-point elements from memory @@ -1725,7 +1729,8 @@ pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) { #[cfg_attr(test, assert_instr(vmaskmovps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 { - maskloadps256(mem_addr as *const i8, mask.as_i32x8()) + let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31)); + simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm256_setzero_ps()) } /// Stores packed single-precision (32-bit) floating-point elements from `a` @@ -1737,7 +1742,8 @@ pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 #[cfg_attr(test, assert_instr(vmaskmovps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) { - maskstoreps256(mem_addr as *mut i8, mask.as_i32x8(), a); + let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31)); + simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a) } /// Loads packed single-precision (32-bit) floating-point elements from memory @@ -1750,7 +1756,8 @@ pub unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) #[cfg_attr(test, assert_instr(vmaskmovps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 { - maskloadps(mem_addr as *const i8, mask.as_i32x4()) + let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31)); + simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, _mm_setzero_ps()) } /// Stores packed single-precision (32-bit) floating-point elements from `a` @@ -1762,7 +1769,8 @@ pub unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 { #[cfg_attr(test, assert_instr(vmaskmovps))] #[stable(feature = "simd_x86", since = 
"1.27.0")] pub unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) { - maskstoreps(mem_addr as *mut i8, mask.as_i32x4(), a); + let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31)); + simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a) } /// Duplicate odd-indexed single-precision (32-bit) floating-point elements @@ -3147,22 +3155,6 @@ unsafe extern "C" { fn vpermilpd256(a: __m256d, b: i64x4) -> __m256d; #[link_name = "llvm.x86.avx.vpermilvar.pd"] fn vpermilpd(a: __m128d, b: i64x2) -> __m128d; - #[link_name = "llvm.x86.avx.maskload.pd.256"] - fn maskloadpd256(mem_addr: *const i8, mask: i64x4) -> __m256d; - #[link_name = "llvm.x86.avx.maskstore.pd.256"] - fn maskstorepd256(mem_addr: *mut i8, mask: i64x4, a: __m256d); - #[link_name = "llvm.x86.avx.maskload.pd"] - fn maskloadpd(mem_addr: *const i8, mask: i64x2) -> __m128d; - #[link_name = "llvm.x86.avx.maskstore.pd"] - fn maskstorepd(mem_addr: *mut i8, mask: i64x2, a: __m128d); - #[link_name = "llvm.x86.avx.maskload.ps.256"] - fn maskloadps256(mem_addr: *const i8, mask: i32x8) -> __m256; - #[link_name = "llvm.x86.avx.maskstore.ps.256"] - fn maskstoreps256(mem_addr: *mut i8, mask: i32x8, a: __m256); - #[link_name = "llvm.x86.avx.maskload.ps"] - fn maskloadps(mem_addr: *const i8, mask: i32x4) -> __m128; - #[link_name = "llvm.x86.avx.maskstore.ps"] - fn maskstoreps(mem_addr: *mut i8, mask: i32x4, a: __m128); #[link_name = "llvm.x86.avx.ldu.dq.256"] fn vlddqu(mem_addr: *const i8) -> i8x32; #[link_name = "llvm.x86.avx.rcp.ps.256"] diff --git a/library/stdarch/crates/core_arch/src/x86/avx2.rs b/library/stdarch/crates/core_arch/src/x86/avx2.rs index 91c10638e0bf0..de27ee7b45efc 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx2.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx2.rs @@ -1786,7 +1786,8 @@ pub fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpmaskmovd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i { - transmute(maskloadd(mem_addr as *const i8, mask.as_i32x4())) + let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31)); + simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i32x4::ZERO).as_m128i() } /// Loads packed 32-bit integers from memory pointed by `mem_addr` using `mask` @@ -1799,7 +1800,8 @@ pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i #[cfg_attr(test, assert_instr(vpmaskmovd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_maskload_epi32(mem_addr: *const i32, mask: __m256i) -> __m256i { - transmute(maskloadd256(mem_addr as *const i8, mask.as_i32x8())) + let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31)); + simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i32x8::ZERO).as_m256i() } /// Loads packed 64-bit integers from memory pointed by `mem_addr` using `mask` @@ -1812,7 +1814,8 @@ pub unsafe fn _mm256_maskload_epi32(mem_addr: *const i32, mask: __m256i) -> __m2 #[cfg_attr(test, assert_instr(vpmaskmovq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i { - transmute(maskloadq(mem_addr as *const i8, mask.as_i64x2())) + let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63)); + simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i64x2::ZERO).as_m128i() } /// Loads packed 64-bit integers from memory pointed by `mem_addr` using `mask` @@ -1825,7 +1828,8 @@ pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, 
mask: __m128i) -> __m128i #[cfg_attr(test, assert_instr(vpmaskmovq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_maskload_epi64(mem_addr: *const i64, mask: __m256i) -> __m256i { - transmute(maskloadq256(mem_addr as *const i8, mask.as_i64x4())) + let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63)); + simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i64x4::ZERO).as_m256i() } /// Stores packed 32-bit integers from `a` into memory pointed by `mem_addr` @@ -1838,7 +1842,8 @@ pub unsafe fn _mm256_maskload_epi64(mem_addr: *const i64, mask: __m256i) -> __m2 #[cfg_attr(test, assert_instr(vpmaskmovd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) { - maskstored(mem_addr as *mut i8, mask.as_i32x4(), a.as_i32x4()) + let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31)); + simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i32x4()) } /// Stores packed 32-bit integers from `a` into memory pointed by `mem_addr` @@ -1851,7 +1856,8 @@ pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) #[cfg_attr(test, assert_instr(vpmaskmovd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_maskstore_epi32(mem_addr: *mut i32, mask: __m256i, a: __m256i) { - maskstored256(mem_addr as *mut i8, mask.as_i32x8(), a.as_i32x8()) + let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31)); + simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i32x8()) } /// Stores packed 64-bit integers from `a` into memory pointed by `mem_addr` @@ -1864,7 +1870,8 @@ pub unsafe fn _mm256_maskstore_epi32(mem_addr: *mut i32, mask: __m256i, a: __m25 #[cfg_attr(test, assert_instr(vpmaskmovq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) { - maskstoreq(mem_addr as *mut i8, mask.as_i64x2(), a.as_i64x2()) + let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63)); + simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i64x2()) } /// Stores packed 64-bit integers from `a` into memory pointed by `mem_addr` @@ -1877,7 +1884,8 @@ pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) #[cfg_attr(test, assert_instr(vpmaskmovq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_maskstore_epi64(mem_addr: *mut i64, mask: __m256i, a: __m256i) { - maskstoreq256(mem_addr as *mut i8, mask.as_i64x4(), a.as_i64x4()) + let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63)); + simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i64x4()) } /// Compares packed 16-bit integers in `a` and `b`, and returns the packed @@ -3645,22 +3653,6 @@ unsafe extern "C" { fn phsubsw(a: i16x16, b: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.pmadd.ub.sw"] fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16; - #[link_name = "llvm.x86.avx2.maskload.d"] - fn maskloadd(mem_addr: *const i8, mask: i32x4) -> i32x4; - #[link_name = "llvm.x86.avx2.maskload.d.256"] - fn maskloadd256(mem_addr: *const i8, mask: i32x8) -> i32x8; - #[link_name = "llvm.x86.avx2.maskload.q"] - fn maskloadq(mem_addr: *const i8, mask: i64x2) -> i64x2; - #[link_name = "llvm.x86.avx2.maskload.q.256"] - fn maskloadq256(mem_addr: *const i8, mask: i64x4) -> i64x4; - #[link_name = "llvm.x86.avx2.maskstore.d"] - fn maskstored(mem_addr: *mut i8, mask: i32x4, a: i32x4); - #[link_name = "llvm.x86.avx2.maskstore.d.256"] - fn maskstored256(mem_addr: *mut i8, mask: i32x8, a: i32x8); - #[link_name = 
"llvm.x86.avx2.maskstore.q"] - fn maskstoreq(mem_addr: *mut i8, mask: i64x2, a: i64x2); - #[link_name = "llvm.x86.avx2.maskstore.q.256"] - fn maskstoreq256(mem_addr: *mut i8, mask: i64x4, a: i64x4); #[link_name = "llvm.x86.avx2.mpsadbw"] fn mpsadbw(a: u8x32, b: u8x32, imm8: i8) -> u16x16; #[link_name = "llvm.x86.avx2.pmul.hr.sw"] From 83e4d8182fc372483ad1c04c8fa472044a61ba1c Mon Sep 17 00:00:00 2001 From: The rustc-josh-sync Cronjob Bot Date: Mon, 10 Nov 2025 04:10:10 +0000 Subject: [PATCH 09/17] Prepare for merging from rust-lang/rust This updates the rust-version file to 8401398e1f14a24670ee1a3203713dc2f0f8b3a8. --- library/stdarch/rust-version | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/stdarch/rust-version b/library/stdarch/rust-version index e313eada45431..04d41c96f5c08 100644 --- a/library/stdarch/rust-version +++ b/library/stdarch/rust-version @@ -1 +1 @@ -73e6c9ebd9123154a196300ef58e30ec8928e74e +8401398e1f14a24670ee1a3203713dc2f0f8b3a8 From 1b3abfea9459b3696252aab32278c350527f1e26 Mon Sep 17 00:00:00 2001 From: MarcoIeni <11428655+MarcoIeni@users.noreply.github.com> Date: Tue, 11 Nov 2025 10:04:15 +0100 Subject: [PATCH 10/17] rename default branch to main --- library/stdarch/.github/workflows/rustc-pull.yml | 2 +- library/stdarch/crates/core_arch/README.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/library/stdarch/.github/workflows/rustc-pull.yml b/library/stdarch/.github/workflows/rustc-pull.yml index 1379bd06b0e9a..ee0c498878f42 100644 --- a/library/stdarch/.github/workflows/rustc-pull.yml +++ b/library/stdarch/.github/workflows/rustc-pull.yml @@ -16,7 +16,7 @@ jobs: # https://rust-lang.zulipchat.com/#narrow/channel/208962-t-libs.2Fstdarch/topic/Subtree.20sync.20automation/with/528461782 zulip-stream-id: 208962 zulip-bot-email: "stdarch-ci-bot@rust-lang.zulipchat.com" - pr-base-branch: master + pr-base-branch: main branch-name: rustc-pull secrets: zulip-api-token: ${{ secrets.ZULIP_API_TOKEN }} diff --git a/library/stdarch/crates/core_arch/README.md b/library/stdarch/crates/core_arch/README.md index fc18a5759dbe4..d341365b987a2 100644 --- a/library/stdarch/crates/core_arch/README.md +++ b/library/stdarch/crates/core_arch/README.md @@ -3,7 +3,7 @@ The `core::arch` module implements architecture-dependent intrinsics (e.g. SIMD). -# Usage +# Usage `core::arch` is available as part of `libcore` and it is re-exported by `libstd`. Prefer using it via `core::arch` or `std::arch` than via this crate. @@ -17,7 +17,7 @@ are: you need to re-compile it for a non-standard target, please prefer using `xargo` and re-compiling `libcore`/`libstd` as appropriate instead of using this crate. - + * using some features that might not be available even behind unstable Rust features. We try to keep these to a minimum. 
If you need to use some of these features, please open an issue so that we can expose them in nightly Rust and @@ -34,7 +34,7 @@ are: * [How to get started][contrib] * [How to help implement intrinsics][help-implement] -[contrib]: https://github.com/rust-lang/stdarch/blob/master/CONTRIBUTING.md +[contrib]: https://github.com/rust-lang/stdarch/blob/HEAD/CONTRIBUTING.md [help-implement]: https://github.com/rust-lang/stdarch/issues/40 [i686]: https://rust-lang.github.io/stdarch/i686/core_arch/ [x86_64]: https://rust-lang.github.io/stdarch/x86_64/core_arch/ From 148a7509a1dee4cc0a017588d91d83b505ea526f Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Tue, 11 Nov 2025 10:55:37 +0100 Subject: [PATCH 11/17] add logic tests for ternarylogic previously the output would just always be all zeroes --- .../crates/core_arch/src/x86/avx512f.rs | 65 ++++++++++++++----- 1 file changed, 50 insertions(+), 15 deletions(-) diff --git a/library/stdarch/crates/core_arch/src/x86/avx512f.rs b/library/stdarch/crates/core_arch/src/x86/avx512f.rs index b60df7dbc9a3e..81242fa6fc3f8 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512f.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512f.rs @@ -46240,11 +46240,22 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_ternarylogic_epi32() { - let a = _mm512_set1_epi32(1 << 2); - let b = _mm512_set1_epi32(1 << 1); - let c = _mm512_set1_epi32(1 << 0); - let r = _mm512_ternarylogic_epi32::<8>(a, b, c); - let e = _mm512_set1_epi32(0); + let a = _mm512_set4_epi32(0b100, 0b110, 0b001, 0b101); + let b = _mm512_set4_epi32(0b010, 0b011, 0b001, 0b101); + let c = _mm512_set4_epi32(0b001, 0b000, 0b001, 0b101); + + // Identity of A. + let r = _mm512_ternarylogic_epi32::<0b1111_0000>(a, b, c); + assert_eq_m512i(r, a); + + // Bitwise or. + let r = _mm512_ternarylogic_epi32::<0b1111_1110>(a, b, c); + let e = _mm512_set4_epi32(0b111, 0b111, 0b001, 0b101); + assert_eq_m512i(r, e); + + // Majority. + let r = _mm512_ternarylogic_epi32::<0b1110_1000>(a, b, c); + let e = _mm512_set4_epi32(0b000, 0b010, 0b001, 0b101); assert_eq_m512i(r, e); } @@ -46274,11 +46285,24 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_ternarylogic_epi32() { - let a = _mm256_set1_epi32(1 << 2); - let b = _mm256_set1_epi32(1 << 1); - let c = _mm256_set1_epi32(1 << 0); - let r = _mm256_ternarylogic_epi32::<8>(a, b, c); - let e = _mm256_set1_epi32(0); + let _mm256_set4_epi32 = |a, b, c, d| _mm256_setr_epi32(a, b, c, d, a, b, c, d); + + let a = _mm256_set4_epi32(0b100, 0b110, 0b001, 0b101); + let b = _mm256_set4_epi32(0b010, 0b011, 0b001, 0b101); + let c = _mm256_set4_epi32(0b001, 0b000, 0b001, 0b101); + + // Identity of A. + let r = _mm256_ternarylogic_epi32::<0b1111_0000>(a, b, c); + assert_eq_m256i(r, a); + + // Bitwise or. + let r = _mm256_ternarylogic_epi32::<0b1111_1110>(a, b, c); + let e = _mm256_set4_epi32(0b111, 0b111, 0b001, 0b101); + assert_eq_m256i(r, e); + + // Majority. 
+ let r = _mm256_ternarylogic_epi32::<0b1110_1000>(a, b, c); + let e = _mm256_set4_epi32(0b000, 0b010, 0b001, 0b101); assert_eq_m256i(r, e); } @@ -46308,11 +46332,22 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_ternarylogic_epi32() { - let a = _mm_set1_epi32(1 << 2); - let b = _mm_set1_epi32(1 << 1); - let c = _mm_set1_epi32(1 << 0); - let r = _mm_ternarylogic_epi32::<8>(a, b, c); - let e = _mm_set1_epi32(0); + let a = _mm_setr_epi32(0b100, 0b110, 0b001, 0b101); + let b = _mm_setr_epi32(0b010, 0b011, 0b001, 0b101); + let c = _mm_setr_epi32(0b001, 0b000, 0b001, 0b101); + + // Identity of A. + let r = _mm_ternarylogic_epi32::<0b1111_0000>(a, b, c); + assert_eq_m128i(r, a); + + // Bitwise or. + let r = _mm_ternarylogic_epi32::<0b1111_1110>(a, b, c); + let e = _mm_setr_epi32(0b111, 0b111, 0b001, 0b101); + assert_eq_m128i(r, e); + + // Majority. + let r = _mm_ternarylogic_epi32::<0b1110_1000>(a, b, c); + let e = _mm_setr_epi32(0b000, 0b010, 0b001, 0b101); assert_eq_m128i(r, e); } From e94ac6b638257d58052e3cde19ac62ed850348b1 Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Tue, 11 Nov 2025 17:04:11 +0100 Subject: [PATCH 12/17] improve ternary logic tests --- .../crates/core_arch/src/x86/avx512f.rs | 51 +++++++++++-------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/library/stdarch/crates/core_arch/src/x86/avx512f.rs b/library/stdarch/crates/core_arch/src/x86/avx512f.rs index 81242fa6fc3f8..b5d0daae6635a 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512f.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512f.rs @@ -46240,22 +46240,25 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_ternarylogic_epi32() { + use core::intrinsics::simd::simd_xor; + let a = _mm512_set4_epi32(0b100, 0b110, 0b001, 0b101); - let b = _mm512_set4_epi32(0b010, 0b011, 0b001, 0b101); - let c = _mm512_set4_epi32(0b001, 0b000, 0b001, 0b101); + let b = _mm512_set4_epi32(0b010, 0b011, 0b001, 0b110); + let c = _mm512_set4_epi32(0b001, 0b000, 0b001, 0b111); // Identity of A. let r = _mm512_ternarylogic_epi32::<0b1111_0000>(a, b, c); assert_eq_m512i(r, a); - // Bitwise or. - let r = _mm512_ternarylogic_epi32::<0b1111_1110>(a, b, c); - let e = _mm512_set4_epi32(0b111, 0b111, 0b001, 0b101); + // Bitwise xor. + let r = _mm512_ternarylogic_epi32::<0b10010110>(a, b, c); + let e = _mm512_set4_epi32(0b111, 0b101, 0b001, 0b100); assert_eq_m512i(r, e); + assert_eq_m512i(r, simd_xor(simd_xor(a, b), c)); - // Majority. + // Majority (2 or more bits set). let r = _mm512_ternarylogic_epi32::<0b1110_1000>(a, b, c); - let e = _mm512_set4_epi32(0b000, 0b010, 0b001, 0b101); + let e = _mm512_set4_epi32(0b000, 0b010, 0b001, 0b111); assert_eq_m512i(r, e); } @@ -46285,24 +46288,27 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_ternarylogic_epi32() { + use core::intrinsics::simd::simd_xor; + let _mm256_set4_epi32 = |a, b, c, d| _mm256_setr_epi32(a, b, c, d, a, b, c, d); let a = _mm256_set4_epi32(0b100, 0b110, 0b001, 0b101); - let b = _mm256_set4_epi32(0b010, 0b011, 0b001, 0b101); - let c = _mm256_set4_epi32(0b001, 0b000, 0b001, 0b101); + let b = _mm256_set4_epi32(0b010, 0b011, 0b001, 0b110); + let c = _mm256_set4_epi32(0b001, 0b000, 0b001, 0b111); // Identity of A. let r = _mm256_ternarylogic_epi32::<0b1111_0000>(a, b, c); assert_eq_m256i(r, a); - // Bitwise or. - let r = _mm256_ternarylogic_epi32::<0b1111_1110>(a, b, c); - let e = _mm256_set4_epi32(0b111, 0b111, 0b001, 0b101); + // Bitwise xor. 
+ let r = _mm256_ternarylogic_epi32::<0b10010110>(a, b, c); + let e = _mm256_set4_epi32(0b111, 0b101, 0b001, 0b100); assert_eq_m256i(r, e); + assert_eq_m256i(r, simd_xor(simd_xor(a, b), c)); - // Majority. + // Majority (2 or more bits set). let r = _mm256_ternarylogic_epi32::<0b1110_1000>(a, b, c); - let e = _mm256_set4_epi32(0b000, 0b010, 0b001, 0b101); + let e = _mm256_set4_epi32(0b000, 0b010, 0b001, 0b111); assert_eq_m256i(r, e); } @@ -46332,22 +46338,25 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_ternarylogic_epi32() { + use core::intrinsics::simd::simd_xor; + let a = _mm_setr_epi32(0b100, 0b110, 0b001, 0b101); - let b = _mm_setr_epi32(0b010, 0b011, 0b001, 0b101); - let c = _mm_setr_epi32(0b001, 0b000, 0b001, 0b101); + let b = _mm_setr_epi32(0b010, 0b011, 0b001, 0b110); + let c = _mm_setr_epi32(0b001, 0b000, 0b001, 0b111); // Identity of A. let r = _mm_ternarylogic_epi32::<0b1111_0000>(a, b, c); assert_eq_m128i(r, a); - // Bitwise or. - let r = _mm_ternarylogic_epi32::<0b1111_1110>(a, b, c); - let e = _mm_setr_epi32(0b111, 0b111, 0b001, 0b101); + // Bitwise xor. + let r = _mm_ternarylogic_epi32::<0b10010110>(a, b, c); + let e = _mm_setr_epi32(0b111, 0b101, 0b001, 0b100); assert_eq_m128i(r, e); + assert_eq_m128i(r, simd_xor(simd_xor(a, b), c)); - // Majority. + // Majority (2 or more bits set). let r = _mm_ternarylogic_epi32::<0b1110_1000>(a, b, c); - let e = _mm_setr_epi32(0b000, 0b010, 0b001, 0b101); + let e = _mm_setr_epi32(0b000, 0b010, 0b001, 0b111); assert_eq_m128i(r, e); } From 0ab7c9e3dac4c8c69267f73ea1e6deaecaacfae3 Mon Sep 17 00:00:00 2001 From: sayantn Date: Wed, 12 Nov 2025 07:36:22 +0530 Subject: [PATCH 13/17] Use SIMD intrinsics for vector shifts --- .../stdarch/crates/core_arch/src/x86/avx2.rs | 90 ++++++++++++------- .../crates/core_arch/src/x86/avx512bw.rs | 84 ++++++++++------- .../crates/core_arch/src/x86/avx512f.rs | 75 ++++++++++------ 3 files changed, 162 insertions(+), 87 deletions(-) diff --git a/library/stdarch/crates/core_arch/src/x86/avx2.rs b/library/stdarch/crates/core_arch/src/x86/avx2.rs index de27ee7b45efc..2e6e010a21f08 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx2.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx2.rs @@ -2786,7 +2786,12 @@ pub fn _mm256_bslli_epi128(a: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpsllvd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i { - unsafe { transmute(psllvd(a.as_i32x4(), count.as_i32x4())) } + unsafe { + let count = count.as_u32x4(); + let no_overflow: u32x4 = simd_lt(count, u32x4::splat(u32::BITS)); + let count = simd_select(no_overflow, count, u32x4::ZERO); + simd_select(no_overflow, simd_shl(a.as_u32x4(), count), u32x4::ZERO).as_m128i() + } } /// Shifts packed 32-bit integers in `a` left by the amount @@ -2799,7 +2804,12 @@ pub fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vpsllvd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i { - unsafe { transmute(psllvd256(a.as_i32x8(), count.as_i32x8())) } + unsafe { + let count = count.as_u32x8(); + let no_overflow: u32x8 = simd_lt(count, u32x8::splat(u32::BITS)); + let count = simd_select(no_overflow, count, u32x8::ZERO); + simd_select(no_overflow, simd_shl(a.as_u32x8(), count), u32x8::ZERO).as_m256i() + } } /// Shifts packed 64-bit integers in `a` left by the amount @@ -2812,7 +2822,12 @@ pub fn _mm256_sllv_epi32(a: __m256i, count: 
__m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpsllvq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i { - unsafe { transmute(psllvq(a.as_i64x2(), count.as_i64x2())) } + unsafe { + let count = count.as_u64x2(); + let no_overflow: u64x2 = simd_lt(count, u64x2::splat(u64::BITS as u64)); + let count = simd_select(no_overflow, count, u64x2::ZERO); + simd_select(no_overflow, simd_shl(a.as_u64x2(), count), u64x2::ZERO).as_m128i() + } } /// Shifts packed 64-bit integers in `a` left by the amount @@ -2825,7 +2840,12 @@ pub fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vpsllvq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i { - unsafe { transmute(psllvq256(a.as_i64x4(), count.as_i64x4())) } + unsafe { + let count = count.as_u64x4(); + let no_overflow: u64x4 = simd_lt(count, u64x4::splat(u64::BITS as u64)); + let count = simd_select(no_overflow, count, u64x4::ZERO); + simd_select(no_overflow, simd_shl(a.as_u64x4(), count), u64x4::ZERO).as_m256i() + } } /// Shifts packed 16-bit integers in `a` right by `count` while @@ -2889,7 +2909,12 @@ pub fn _mm256_srai_epi32(a: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpsravd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i { - unsafe { transmute(psravd(a.as_i32x4(), count.as_i32x4())) } + unsafe { + let count = count.as_u32x4(); + let no_overflow: u32x4 = simd_lt(count, u32x4::splat(u32::BITS)); + let count = simd_select(no_overflow, transmute(count), i32x4::splat(31)); + simd_shr(a.as_i32x4(), count).as_m128i() + } } /// Shifts packed 32-bit integers in `a` right by the amount specified by the @@ -2901,7 +2926,12 @@ pub fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vpsravd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i { - unsafe { transmute(psravd256(a.as_i32x8(), count.as_i32x8())) } + unsafe { + let count = count.as_u32x8(); + let no_overflow: u32x8 = simd_lt(count, u32x8::splat(u32::BITS)); + let count = simd_select(no_overflow, transmute(count), i32x8::splat(31)); + simd_shr(a.as_i32x8(), count).as_m256i() + } } /// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. 
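
Stepping back to the ternary-logic tests in the two patches above: the const generic is a 3-input truth table, where bit `(a << 2) | (b << 1) | c` of IMM8 gives the output bit for that combination of inputs, so `0b1111_0000` is "identity of A", `0b1001_0110` is three-way xor, and `0b1110_1000` is majority. A scalar sketch of that semantics (plain Rust, not the intrinsic itself):

// Evaluate the IMM8 truth table bit-by-bit, with `a` as the high index bit.
fn ternarylogic_u32(imm8: u8, a: u32, b: u32, c: u32) -> u32 {
    let mut out = 0;
    for bit in 0..32 {
        let idx = ((a >> bit) & 1) << 2 | ((b >> bit) & 1) << 1 | ((c >> bit) & 1);
        out |= ((imm8 as u32 >> idx) & 1) << bit;
    }
    out
}

fn main() {
    let (a, b, c) = (0b100, 0b010, 0b001);
    assert_eq!(ternarylogic_u32(0b1111_0000, a, b, c), a);         // identity of a
    assert_eq!(ternarylogic_u32(0b1001_0110, a, b, c), a ^ b ^ c); // xor
    assert_eq!(ternarylogic_u32(0b1110_1000, a, b, c), 0);         // majority: no bit has 2+ inputs set
}
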
@@ -3084,7 +3114,12 @@ pub fn _mm256_srli_epi64(a: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpsrlvd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i { - unsafe { transmute(psrlvd(a.as_i32x4(), count.as_i32x4())) } + unsafe { + let count = count.as_u32x4(); + let no_overflow: u32x4 = simd_lt(count, u32x4::splat(u32::BITS)); + let count = simd_select(no_overflow, count, u32x4::ZERO); + simd_select(no_overflow, simd_shr(a.as_u32x4(), count), u32x4::ZERO).as_m128i() + } } /// Shifts packed 32-bit integers in `a` right by the amount specified by @@ -3096,7 +3131,12 @@ pub fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vpsrlvd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i { - unsafe { transmute(psrlvd256(a.as_i32x8(), count.as_i32x8())) } + unsafe { + let count = count.as_u32x8(); + let no_overflow: u32x8 = simd_lt(count, u32x8::splat(u32::BITS)); + let count = simd_select(no_overflow, count, u32x8::ZERO); + simd_select(no_overflow, simd_shr(a.as_u32x8(), count), u32x8::ZERO).as_m256i() + } } /// Shifts packed 64-bit integers in `a` right by the amount specified by @@ -3108,7 +3148,12 @@ pub fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpsrlvq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i { - unsafe { transmute(psrlvq(a.as_i64x2(), count.as_i64x2())) } + unsafe { + let count = count.as_u64x2(); + let no_overflow: u64x2 = simd_lt(count, u64x2::splat(u64::BITS as u64)); + let count = simd_select(no_overflow, count, u64x2::ZERO); + simd_select(no_overflow, simd_shr(a.as_u64x2(), count), u64x2::ZERO).as_m128i() + } } /// Shifts packed 64-bit integers in `a` right by the amount specified by @@ -3120,7 +3165,12 @@ pub fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vpsrlvq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i { - unsafe { transmute(psrlvq256(a.as_i64x4(), count.as_i64x4())) } + unsafe { + let count = count.as_u64x4(); + let no_overflow: u64x4 = simd_lt(count, u64x4::splat(u64::BITS as u64)); + let count = simd_select(no_overflow, count, u64x4::ZERO); + simd_select(no_overflow, simd_shr(a.as_u64x4(), count), u64x4::ZERO).as_m256i() + } } /// Load 256-bits of integer data from memory into dst using a non-temporal memory hint. 
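
All of the `sllv`/`srlv` rewrites encode the same per-lane rule: a shift count of 32 (or 64) and above yields an all-zero lane. The count itself is also forced to zero on those lanes, because an out-of-range count must never reach the raw `simd_shl`/`simd_shr` (whose result would be undefined); the `simd_select` on `no_overflow` then discards the dummy result. A scalar sketch of the lane semantics:

// Per-lane model: out-of-range logical shift counts produce zero rather
// than wrapping modulo the lane width.
fn sllv_lane(a: u32, count: u32) -> u32 {
    a.checked_shl(count).unwrap_or(0)
}

fn srlv_lane(a: u32, count: u32) -> u32 {
    a.checked_shr(count).unwrap_or(0)
}

fn main() {
    assert_eq!(sllv_lane(1, 31), 1 << 31);
    assert_eq!(sllv_lane(1, 32), 0); // not 1: no modulo-32 wrap
    assert_eq!(srlv_lane(u32::MAX, 33), 0);
}
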
mem_addr @@ -3679,36 +3729,16 @@ unsafe extern "C" { fn pslld(a: i32x8, count: i32x4) -> i32x8; #[link_name = "llvm.x86.avx2.psll.q"] fn psllq(a: i64x4, count: i64x2) -> i64x4; - #[link_name = "llvm.x86.avx2.psllv.d"] - fn psllvd(a: i32x4, count: i32x4) -> i32x4; - #[link_name = "llvm.x86.avx2.psllv.d.256"] - fn psllvd256(a: i32x8, count: i32x8) -> i32x8; - #[link_name = "llvm.x86.avx2.psllv.q"] - fn psllvq(a: i64x2, count: i64x2) -> i64x2; - #[link_name = "llvm.x86.avx2.psllv.q.256"] - fn psllvq256(a: i64x4, count: i64x4) -> i64x4; #[link_name = "llvm.x86.avx2.psra.w"] fn psraw(a: i16x16, count: i16x8) -> i16x16; #[link_name = "llvm.x86.avx2.psra.d"] fn psrad(a: i32x8, count: i32x4) -> i32x8; - #[link_name = "llvm.x86.avx2.psrav.d"] - fn psravd(a: i32x4, count: i32x4) -> i32x4; - #[link_name = "llvm.x86.avx2.psrav.d.256"] - fn psravd256(a: i32x8, count: i32x8) -> i32x8; #[link_name = "llvm.x86.avx2.psrl.w"] fn psrlw(a: i16x16, count: i16x8) -> i16x16; #[link_name = "llvm.x86.avx2.psrl.d"] fn psrld(a: i32x8, count: i32x4) -> i32x8; #[link_name = "llvm.x86.avx2.psrl.q"] fn psrlq(a: i64x4, count: i64x2) -> i64x4; - #[link_name = "llvm.x86.avx2.psrlv.d"] - fn psrlvd(a: i32x4, count: i32x4) -> i32x4; - #[link_name = "llvm.x86.avx2.psrlv.d.256"] - fn psrlvd256(a: i32x8, count: i32x8) -> i32x8; - #[link_name = "llvm.x86.avx2.psrlv.q"] - fn psrlvq(a: i64x2, count: i64x2) -> i64x2; - #[link_name = "llvm.x86.avx2.psrlv.q.256"] - fn psrlvq256(a: i64x4, count: i64x4) -> i64x4; #[link_name = "llvm.x86.avx2.pshuf.b"] fn pshufb(a: u8x32, b: u8x32) -> u8x32; #[link_name = "llvm.x86.avx2.permd"] diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs index 72842f4546754..78f4dd53b96df 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs @@ -6864,7 +6864,12 @@ pub fn _mm_maskz_slli_epi16(k: __mmask8, a: __m128i) -> __m128i #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsllvw))] pub fn _mm512_sllv_epi16(a: __m512i, count: __m512i) -> __m512i { - unsafe { transmute(vpsllvw(a.as_i16x32(), count.as_i16x32())) } + unsafe { + let count = count.as_u16x32(); + let no_overflow: u16x32 = simd_lt(count, u16x32::splat(u16::BITS as u16)); + let count = simd_select(no_overflow, count, u16x32::ZERO); + simd_select(no_overflow, simd_shl(a.as_u16x32(), count), u16x32::ZERO).as_m512i() + } } /// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -6903,7 +6908,12 @@ pub fn _mm512_maskz_sllv_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m5 #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsllvw))] pub fn _mm256_sllv_epi16(a: __m256i, count: __m256i) -> __m256i { - unsafe { transmute(vpsllvw256(a.as_i16x16(), count.as_i16x16())) } + unsafe { + let count = count.as_u16x16(); + let no_overflow: u16x16 = simd_lt(count, u16x16::splat(u16::BITS as u16)); + let count = simd_select(no_overflow, count, u16x16::ZERO); + simd_select(no_overflow, simd_shl(a.as_u16x16(), count), u16x16::ZERO).as_m256i() + } } /// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -6942,7 +6952,12 @@ pub fn _mm256_maskz_sllv_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m2 #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsllvw))] pub fn _mm_sllv_epi16(a: __m128i, count: __m128i) -> __m128i { - unsafe { transmute(vpsllvw128(a.as_i16x8(), count.as_i16x8())) } + unsafe { + let count = count.as_u16x8(); + let no_overflow: u16x8 = simd_lt(count, u16x8::splat(u16::BITS as u16)); + let count = simd_select(no_overflow, count, u16x8::ZERO); + simd_select(no_overflow, simd_shl(a.as_u16x8(), count), u16x8::ZERO).as_m128i() + } } /// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -7200,7 +7215,12 @@ pub fn _mm_maskz_srli_epi16(k: __mmask8, a: __m128i) -> __m128i #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsrlvw))] pub fn _mm512_srlv_epi16(a: __m512i, count: __m512i) -> __m512i { - unsafe { transmute(vpsrlvw(a.as_i16x32(), count.as_i16x32())) } + unsafe { + let count = count.as_u16x32(); + let no_overflow: u16x32 = simd_lt(count, u16x32::splat(u16::BITS as u16)); + let count = simd_select(no_overflow, count, u16x32::ZERO); + simd_select(no_overflow, simd_shr(a.as_u16x32(), count), u16x32::ZERO).as_m512i() + } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -7239,7 +7259,12 @@ pub fn _mm512_maskz_srlv_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m5 #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsrlvw))] pub fn _mm256_srlv_epi16(a: __m256i, count: __m256i) -> __m256i { - unsafe { transmute(vpsrlvw256(a.as_i16x16(), count.as_i16x16())) } + unsafe { + let count = count.as_u16x16(); + let no_overflow: u16x16 = simd_lt(count, u16x16::splat(u16::BITS as u16)); + let count = simd_select(no_overflow, count, u16x16::ZERO); + simd_select(no_overflow, simd_shr(a.as_u16x16(), count), u16x16::ZERO).as_m256i() + } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -7278,7 +7303,12 @@ pub fn _mm256_maskz_srlv_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m2 #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsrlvw))] pub fn _mm_srlv_epi16(a: __m128i, count: __m128i) -> __m128i { - unsafe { transmute(vpsrlvw128(a.as_i16x8(), count.as_i16x8())) } + unsafe { + let count = count.as_u16x8(); + let no_overflow: u16x8 = simd_lt(count, u16x8::splat(u16::BITS as u16)); + let count = simd_select(no_overflow, count, u16x8::ZERO); + simd_select(no_overflow, simd_shr(a.as_u16x8(), count), u16x8::ZERO).as_m128i() + } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -7523,7 +7553,12 @@ pub fn _mm_maskz_srai_epi16(k: __mmask8, a: __m128i) -> __m128i #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsravw))] pub fn _mm512_srav_epi16(a: __m512i, count: __m512i) -> __m512i { - unsafe { transmute(vpsravw(a.as_i16x32(), count.as_i16x32())) } + unsafe { + let count = count.as_u16x32(); + let no_overflow: u16x32 = simd_lt(count, u16x32::splat(u16::BITS as u16)); + let count = simd_select(no_overflow, transmute(count), i16x32::splat(15)); + simd_shr(a.as_i16x32(), count).as_m512i() + } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -7562,7 +7597,12 @@ pub fn _mm512_maskz_srav_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m5 #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsravw))] pub fn _mm256_srav_epi16(a: __m256i, count: __m256i) -> __m256i { - unsafe { transmute(vpsravw256(a.as_i16x16(), count.as_i16x16())) } + unsafe { + let count = count.as_u16x16(); + let no_overflow: u16x16 = simd_lt(count, u16x16::splat(u16::BITS as u16)); + let count = simd_select(no_overflow, transmute(count), i16x16::splat(15)); + simd_shr(a.as_i16x16(), count).as_m256i() + } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -7601,7 +7641,12 @@ pub fn _mm256_maskz_srav_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m2 #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsravw))] pub fn _mm_srav_epi16(a: __m128i, count: __m128i) -> __m128i { - unsafe { transmute(vpsravw128(a.as_i16x8(), count.as_i16x8())) } + unsafe { + let count = count.as_u16x8(); + let no_overflow: u16x8 = simd_lt(count, u16x8::splat(u16::BITS as u16)); + let count = simd_select(no_overflow, transmute(count), i16x8::splat(15)); + simd_shr(a.as_i16x8(), count).as_m128i() + } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
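
The 16-bit AVX-512BW variants apply the same rule with `u16::BITS` (hence the `as u16` casts, since the comparison happens in the lane type): any count of 16 or more zeroes the lane for the logical shifts. A per-lane sketch:

// 16-bit lane model for vpsllvw/vpsrlvw: counts >= 16 produce zero.
fn srlv_epi16_lane(a: u16, count: u16) -> u16 {
    if count < u16::BITS as u16 { a >> count } else { 0 }
}

fn main() {
    for count in 0..=20u32 {
        let expect = 0x8000u16.checked_shr(count).unwrap_or(0);
        assert_eq!(srlv_epi16_lane(0x8000, count as u16), expect);
    }
}
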
@@ -11657,33 +11702,12 @@ unsafe extern "C" { #[link_name = "llvm.x86.avx512.psll.w.512"] fn vpsllw(a: i16x32, count: i16x8) -> i16x32; - #[link_name = "llvm.x86.avx512.psllv.w.512"] - fn vpsllvw(a: i16x32, b: i16x32) -> i16x32; - #[link_name = "llvm.x86.avx512.psllv.w.256"] - fn vpsllvw256(a: i16x16, b: i16x16) -> i16x16; - #[link_name = "llvm.x86.avx512.psllv.w.128"] - fn vpsllvw128(a: i16x8, b: i16x8) -> i16x8; - #[link_name = "llvm.x86.avx512.psrl.w.512"] fn vpsrlw(a: i16x32, count: i16x8) -> i16x32; - #[link_name = "llvm.x86.avx512.psrlv.w.512"] - fn vpsrlvw(a: i16x32, b: i16x32) -> i16x32; - #[link_name = "llvm.x86.avx512.psrlv.w.256"] - fn vpsrlvw256(a: i16x16, b: i16x16) -> i16x16; - #[link_name = "llvm.x86.avx512.psrlv.w.128"] - fn vpsrlvw128(a: i16x8, b: i16x8) -> i16x8; - #[link_name = "llvm.x86.avx512.psra.w.512"] fn vpsraw(a: i16x32, count: i16x8) -> i16x32; - #[link_name = "llvm.x86.avx512.psrav.w.512"] - fn vpsravw(a: i16x32, count: i16x32) -> i16x32; - #[link_name = "llvm.x86.avx512.psrav.w.256"] - fn vpsravw256(a: i16x16, count: i16x16) -> i16x16; - #[link_name = "llvm.x86.avx512.psrav.w.128"] - fn vpsravw128(a: i16x8, count: i16x8) -> i16x8; - #[link_name = "llvm.x86.avx512.vpermi2var.hi.512"] fn vpermi2w(a: i16x32, idx: i16x32, b: i16x32) -> i16x32; #[link_name = "llvm.x86.avx512.vpermi2var.hi.256"] diff --git a/library/stdarch/crates/core_arch/src/x86/avx512f.rs b/library/stdarch/crates/core_arch/src/x86/avx512f.rs index c87946846e65c..0ff0e7575c501 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512f.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512f.rs @@ -20940,7 +20940,12 @@ pub fn _mm_maskz_srai_epi64(k: __mmask8, a: __m128i) -> __m128i #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsravd))] pub fn _mm512_srav_epi32(a: __m512i, count: __m512i) -> __m512i { - unsafe { transmute(vpsravd(a.as_i32x16(), count.as_i32x16())) } + unsafe { + let count = count.as_u32x16(); + let no_overflow: u32x16 = simd_lt(count, u32x16::splat(u32::BITS)); + let count = simd_select(no_overflow, transmute(count), i32x16::splat(31)); + simd_shr(a.as_i32x16(), count).as_m512i() + } } /// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21035,7 +21040,12 @@ pub fn _mm_maskz_srav_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsravq))] pub fn _mm512_srav_epi64(a: __m512i, count: __m512i) -> __m512i { - unsafe { transmute(vpsravq(a.as_i64x8(), count.as_i64x8())) } + unsafe { + let count = count.as_u64x8(); + let no_overflow: u64x8 = simd_lt(count, u64x8::splat(u64::BITS as u64)); + let count = simd_select(no_overflow, transmute(count), i64x8::splat(63)); + simd_shr(a.as_i64x8(), count).as_m512i() + } } /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -21074,7 +21084,12 @@ pub fn _mm512_maskz_srav_epi64(k: __mmask8, a: __m512i, count: __m512i) -> __m51 #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsravq))] pub fn _mm256_srav_epi64(a: __m256i, count: __m256i) -> __m256i { - unsafe { transmute(vpsravq256(a.as_i64x4(), count.as_i64x4())) } + unsafe { + let count = count.as_u64x4(); + let no_overflow: u64x4 = simd_lt(count, u64x4::splat(u64::BITS as u64)); + let count = simd_select(no_overflow, transmute(count), i64x4::splat(63)); + simd_shr(a.as_i64x4(), count).as_m256i() + } } /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21113,7 +21128,12 @@ pub fn _mm256_maskz_srav_epi64(k: __mmask8, a: __m256i, count: __m256i) -> __m25 #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsravq))] pub fn _mm_srav_epi64(a: __m128i, count: __m128i) -> __m128i { - unsafe { transmute(vpsravq128(a.as_i64x2(), count.as_i64x2())) } + unsafe { + let count = count.as_u64x2(); + let no_overflow: u64x2 = simd_lt(count, u64x2::splat(u64::BITS as u64)); + let count = simd_select(no_overflow, transmute(count), i64x2::splat(63)); + simd_shr(a.as_i64x2(), count).as_m128i() + } } /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21692,7 +21712,12 @@ pub fn _mm_maskz_rorv_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsllvd))] pub fn _mm512_sllv_epi32(a: __m512i, count: __m512i) -> __m512i { - unsafe { transmute(vpsllvd(a.as_i32x16(), count.as_i32x16())) } + unsafe { + let count = count.as_u32x16(); + let no_overflow: u32x16 = simd_lt(count, u32x16::splat(u32::BITS)); + let count = simd_select(no_overflow, count, u32x16::ZERO); + simd_select(no_overflow, simd_shl(a.as_u32x16(), count), u32x16::ZERO).as_m512i() + } } /// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21787,7 +21812,12 @@ pub fn _mm_maskz_sllv_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsrlvd))] pub fn _mm512_srlv_epi32(a: __m512i, count: __m512i) -> __m512i { - unsafe { transmute(vpsrlvd(a.as_i32x16(), count.as_i32x16())) } + unsafe { + let count = count.as_u32x16(); + let no_overflow: u32x16 = simd_lt(count, u32x16::splat(u32::BITS)); + let count = simd_select(no_overflow, count, u32x16::ZERO); + simd_select(no_overflow, simd_shr(a.as_u32x16(), count), u32x16::ZERO).as_m512i() + } } /// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
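The hunks above all follow the same pattern: rather than calling the vpsllv*/vpsrlv*/vpsrav* LLVM intrinsics, the variable shifts are expressed with the generic simd_shl/simd_shr plus simd_select, with out-of-range counts handled explicitly, because handing the generic shift a count at or above the lane width would be undefined behaviour. A scalar sketch of the per-lane semantics these hunks encode (an illustration, not code from the patch):

    // Logical shifts yield 0 once the count reaches the lane width; the
    // arithmetic right shift keeps filling with the sign bit, as if the
    // count were clamped to width - 1.
    fn sllv_lane(a: u16, count: u16) -> u16 {
        if count < u16::BITS as u16 { a << count } else { 0 }
    }

    fn srlv_lane(a: u16, count: u16) -> u16 {
        if count < u16::BITS as u16 { a >> count } else { 0 }
    }

    fn srav_lane(a: i16, count: u16) -> i16 {
        a >> count.min(u16::BITS as u16 - 1)
    }

    fn main() {
        assert_eq!(sllv_lane(0x00ff, 8), 0xff00);
        assert_eq!(sllv_lane(0x00ff, 16), 0); // count == lane width
        assert_eq!(srav_lane(-2, 100), -1); // sign fill saturates
    }

The no_overflow mask in the vector code does double duty: it first replaces any out-of-range count (so simd_shl/simd_shr never see an undefined shift amount) and, for the logical shifts, zeroes those lanes of the result afterwards.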
@@ -21882,7 +21912,12 @@ pub fn _mm_maskz_srlv_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsllvq))] pub fn _mm512_sllv_epi64(a: __m512i, count: __m512i) -> __m512i { - unsafe { transmute(vpsllvq(a.as_i64x8(), count.as_i64x8())) } + unsafe { + let count = count.as_u64x8(); + let no_overflow: u64x8 = simd_lt(count, u64x8::splat(u64::BITS as u64)); + let count = simd_select(no_overflow, count, u64x8::ZERO); + simd_select(no_overflow, simd_shl(a.as_u64x8(), count), u64x8::ZERO).as_m512i() + } } /// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21977,7 +22012,12 @@ pub fn _mm_maskz_sllv_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpsrlvq))] pub fn _mm512_srlv_epi64(a: __m512i, count: __m512i) -> __m512i { - unsafe { transmute(vpsrlvq(a.as_i64x8(), count.as_i64x8())) } + unsafe { + let count = count.as_u64x8(); + let no_overflow: u64x8 = simd_lt(count, u64x8::splat(u64::BITS as u64)); + let count = simd_select(no_overflow, count, u64x8::ZERO); + simd_select(no_overflow, simd_shr(a.as_u64x8(), count), u64x8::ZERO).as_m512i() + } } /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -42881,15 +42921,6 @@ unsafe extern "C" { #[link_name = "llvm.x86.avx512.mask.cmp.pd.128"] fn vcmppd128(a: f64x2, b: f64x2, op: i32, m: i8) -> i8; - #[link_name = "llvm.x86.avx512.psllv.d.512"] - fn vpsllvd(a: i32x16, b: i32x16) -> i32x16; - #[link_name = "llvm.x86.avx512.psrlv.d.512"] - fn vpsrlvd(a: i32x16, b: i32x16) -> i32x16; - #[link_name = "llvm.x86.avx512.psllv.q.512"] - fn vpsllvq(a: i64x8, b: i64x8) -> i64x8; - #[link_name = "llvm.x86.avx512.psrlv.q.512"] - fn vpsrlvq(a: i64x8, b: i64x8) -> i64x8; - #[link_name = "llvm.x86.avx512.psll.d.512"] fn vpslld(a: i32x16, count: i32x4) -> i32x16; #[link_name = "llvm.x86.avx512.psrl.d.512"] @@ -42909,16 +42940,6 @@ unsafe extern "C" { #[link_name = "llvm.x86.avx512.psra.q.128"] fn vpsraq128(a: i64x2, count: i64x2) -> i64x2; - #[link_name = "llvm.x86.avx512.psrav.d.512"] - fn vpsravd(a: i32x16, count: i32x16) -> i32x16; - - #[link_name = "llvm.x86.avx512.psrav.q.512"] - fn vpsravq(a: i64x8, count: i64x8) -> i64x8; - #[link_name = "llvm.x86.avx512.psrav.q.256"] - fn vpsravq256(a: i64x4, count: i64x4) -> i64x4; - #[link_name = "llvm.x86.avx512.psrav.q.128"] - fn vpsravq128(a: i64x2, count: i64x2) -> i64x2; - #[link_name = "llvm.x86.avx512.vpermilvar.ps.512"] fn vpermilps(a: f32x16, b: i32x16) -> f32x16; #[link_name = "llvm.x86.avx512.vpermilvar.pd.512"] From 0882a6e02a61daa42bb221b33e6ca74e9fd07ce6 Mon Sep 17 00:00:00 2001 From: Matthew Maurer Date: Fri, 14 Nov 2025 19:25:34 +0000 Subject: [PATCH 14/17] aarch64: Remove withdrawn FEAT_TME ARM has withdrawn FEAT_TME https://developer.arm.com/documentation/102105/lb-05/ LLVM has also dropped support for enabling the feature. 
--- .../aarch64-unknown-linux-gnu/Dockerfile | 3 +- .../aarch64_be-unknown-linux-gnu/Dockerfile | 1 - .../crates/core_arch/src/aarch64/mod.rs | 4 - .../crates/core_arch/src/aarch64/tme.rs | 201 ------------------ .../crates/stdarch-test/src/disassembly.rs | 2 +- .../crates/stdarch-verify/tests/arm.rs | 1 - 6 files changed, 2 insertions(+), 210 deletions(-) delete mode 100644 library/stdarch/crates/core_arch/src/aarch64/tme.rs diff --git a/library/stdarch/ci/docker/aarch64-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/aarch64-unknown-linux-gnu/Dockerfile index 70c06509755cb..2768c521ebccc 100644 --- a/library/stdarch/ci/docker/aarch64-unknown-linux-gnu/Dockerfile +++ b/library/stdarch/ci/docker/aarch64-unknown-linux-gnu/Dockerfile @@ -15,5 +15,4 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ ENV CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=aarch64-linux-gnu-gcc \ CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUNNER="qemu-aarch64 -cpu max -L /usr/aarch64-linux-gnu" \ - OBJDUMP=aarch64-linux-gnu-objdump \ - STDARCH_TEST_SKIP_FEATURE=tme + OBJDUMP=aarch64-linux-gnu-objdump diff --git a/library/stdarch/ci/docker/aarch64_be-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/aarch64_be-unknown-linux-gnu/Dockerfile index 56ddbd990b18b..f85c6a2592e99 100644 --- a/library/stdarch/ci/docker/aarch64_be-unknown-linux-gnu/Dockerfile +++ b/library/stdarch/ci/docker/aarch64_be-unknown-linux-gnu/Dockerfile @@ -27,4 +27,3 @@ ENV AARCH64_BE_LIBC="${AARCH64_BE_TOOLCHAIN}/aarch64_be-none-linux-gnu/libc" ENV CARGO_TARGET_AARCH64_BE_UNKNOWN_LINUX_GNU_LINKER="${AARCH64_BE_TOOLCHAIN}/bin/aarch64_be-none-linux-gnu-gcc" ENV CARGO_TARGET_AARCH64_BE_UNKNOWN_LINUX_GNU_RUNNER="qemu-aarch64_be -cpu max -L ${AARCH64_BE_LIBC}" ENV OBJDUMP="${AARCH64_BE_TOOLCHAIN}/bin/aarch64_be-none-linux-gnu-objdump" -ENV STDARCH_TEST_SKIP_FEATURE=tme diff --git a/library/stdarch/crates/core_arch/src/aarch64/mod.rs b/library/stdarch/crates/core_arch/src/aarch64/mod.rs index f4b9b1c30251e..b48bdac57e7db 100644 --- a/library/stdarch/crates/core_arch/src/aarch64/mod.rs +++ b/library/stdarch/crates/core_arch/src/aarch64/mod.rs @@ -21,10 +21,6 @@ mod neon; #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub use self::neon::*; -mod tme; -#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] -pub use self::tme::*; - mod prefetch; #[unstable(feature = "stdarch_aarch64_prefetch", issue = "117217")] pub use self::prefetch::*; diff --git a/library/stdarch/crates/core_arch/src/aarch64/tme.rs b/library/stdarch/crates/core_arch/src/aarch64/tme.rs deleted file mode 100644 index 207633c1f8d34..0000000000000 --- a/library/stdarch/crates/core_arch/src/aarch64/tme.rs +++ /dev/null @@ -1,201 +0,0 @@ -//! ARM's Transactional Memory Extensions (TME). -//! -//! This CPU feature is available on Aarch64 - A architecture profile. -//! This feature is in the non-neon feature set. TME specific vendor documentation can -//! be found [TME Intrinsics Introduction][tme_intrinsics_intro]. -//! -//! The reference is [ACLE Q4 2019][acle_q4_2019_ref]. -//! -//! ACLE has a section for TME extensions and state masks for aborts and failure codes. -//! [ARM A64 Architecture Register Datasheet][a_profile_future] also describes possible failure code scenarios. -//! -//! [acle_q4_2019_ref]: https://static.docs.arm.com/101028/0010/ACLE_2019Q4_release-0010.pdf -//! [tme_intrinsics_intro]: https://developer.arm.com/docs/101028/0010/transactional-memory-extension-tme-intrinsics -//! 
[llvm_aarch64_int]: https://github.com/llvm/llvm-project/commit/a36d31478c182903523e04eb271bbf102bfab2cc#diff-ff24e1c35f4d54f1110ce5d90c709319R626-R646 -//! [a_profile_future]: https://static.docs.arm.com/ddi0601/a/SysReg_xml_futureA-2019-04.pdf?_ga=2.116560387.441514988.1590524918-1110153136.1588469296 - -#[cfg(test)] -use stdarch_test::assert_instr; - -unsafe extern "unadjusted" { - #[link_name = "llvm.aarch64.tstart"] - fn aarch64_tstart() -> u64; - #[link_name = "llvm.aarch64.tcommit"] - fn aarch64_tcommit(); - #[link_name = "llvm.aarch64.tcancel"] - fn aarch64_tcancel(imm0: u64); - #[link_name = "llvm.aarch64.ttest"] - fn aarch64_ttest() -> u64; -} - -/// Transaction successfully started. -#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] -pub const _TMSTART_SUCCESS: u64 = 0x00_u64; - -/// Extraction mask for failure reason -#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] -pub const _TMFAILURE_REASON: u64 = 0x00007FFF_u64; - -/// Transaction retry is possible. -#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] -pub const _TMFAILURE_RTRY: u64 = 1 << 15; - -/// Transaction executed a TCANCEL instruction -#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] -pub const _TMFAILURE_CNCL: u64 = 1 << 16; - -/// Transaction aborted because a conflict occurred -#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] -pub const _TMFAILURE_MEM: u64 = 1 << 17; - -/// Fallback error type for any other reason -#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] -pub const _TMFAILURE_IMP: u64 = 1 << 18; - -/// Transaction aborted because a non-permissible operation was attempted -#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] -pub const _TMFAILURE_ERR: u64 = 1 << 19; - -/// Transaction aborted due to read or write set limit was exceeded -#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] -pub const _TMFAILURE_SIZE: u64 = 1 << 20; - -/// Transaction aborted due to transactional nesting level was exceeded -#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] -pub const _TMFAILURE_NEST: u64 = 1 << 21; - -/// Transaction aborted due to a debug trap. -#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] -pub const _TMFAILURE_DBG: u64 = 1 << 22; - -/// Transaction failed from interrupt -#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] -pub const _TMFAILURE_INT: u64 = 1 << 23; - -/// Indicates a TRIVIAL version of TM is available -#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] -pub const _TMFAILURE_TRIVIAL: u64 = 1 << 24; - -// NOTE: Tests for these instructions are disabled on MSVC as dumpbin doesn't -// understand these instructions. - -/// Starts a new transaction. When the transaction starts successfully the return value is 0. -/// If the transaction fails, all state modifications are discarded and a cause of the failure -/// is encoded in the return value. -/// -/// [ARM TME Intrinsics](https://developer.arm.com/docs/101028/0010/transactional-memory-extension-tme-intrinsics). -#[inline] -#[target_feature(enable = "tme")] -#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(tstart))] -#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] -pub unsafe fn __tstart() -> u64 { - aarch64_tstart() -} - -/// Commits the current transaction. For a nested transaction, the only effect is that the -/// transactional nesting depth is decreased. 
For an outer transaction, the state modifications -/// performed transactionally are committed to the architectural state. -/// -/// [ARM TME Intrinsics](https://developer.arm.com/docs/101028/0010/transactional-memory-extension-tme-intrinsics). -#[inline] -#[target_feature(enable = "tme")] -#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(tcommit))] -#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] -pub unsafe fn __tcommit() { - aarch64_tcommit() -} - -/// Cancels the current transaction and discards all state modifications that were performed transactionally. -/// -/// [ARM TME Intrinsics](https://developer.arm.com/docs/101028/0010/transactional-memory-extension-tme-intrinsics). -#[inline] -#[target_feature(enable = "tme")] -#[cfg_attr( - all(test, not(target_env = "msvc")), - assert_instr(tcancel, IMM16 = 0x0) -)] -#[rustc_legacy_const_generics(0)] -#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] -pub unsafe fn __tcancel<const IMM16: u64>() { - static_assert!(IMM16 <= 65535); - aarch64_tcancel(IMM16); -} - -/// Tests if executing inside a transaction. If no transaction is currently executing, -/// the return value is 0. Otherwise, this intrinsic returns the depth of the transaction. -/// -/// [ARM TME Intrinsics](https://developer.arm.com/docs/101028/0010/transactional-memory-extension-tme-intrinsics). -#[inline] -#[target_feature(enable = "tme")] -#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(ttest))] -#[unstable(feature = "stdarch_aarch64_tme", issue = "117216")] -pub unsafe fn __ttest() -> u64 { - aarch64_ttest() -} - -#[cfg(test)] -mod tests { - use stdarch_test::simd_test; - - use crate::core_arch::aarch64::*; - - const CANCEL_CODE: u64 = (0 | (0x123 & _TMFAILURE_REASON) as u64) as u64; - - #[simd_test(enable = "tme")] - unsafe fn test_tstart() { - let mut x = 0; - for i in 0..10 { - let code = tme::__tstart(); - if code == _TMSTART_SUCCESS { - x += 1; - assert_eq!(x, i + 1); - break; - } - assert_eq!(x, 0); - } - } - - #[simd_test(enable = "tme")] - unsafe fn test_tcommit() { - let mut x = 0; - for i in 0..10 { - let code = tme::__tstart(); - if code == _TMSTART_SUCCESS { - x += 1; - assert_eq!(x, i + 1); - tme::__tcommit(); - } - assert_eq!(x, i + 1); - } - } - - #[simd_test(enable = "tme")] - unsafe fn test_tcancel() { - let mut x = 0; - - for i in 0..10 { - let code = tme::__tstart(); - if code == _TMSTART_SUCCESS { - x += 1; - assert_eq!(x, i + 1); - tme::__tcancel::<CANCEL_CODE>(); - break; - } - } - - assert_eq!(x, 0); - } - - #[simd_test(enable = "tme")] - unsafe fn test_ttest() { - for _ in 0..10 { - let code = tme::__tstart(); - if code == _TMSTART_SUCCESS { - if tme::__ttest() == 2 { - tme::__tcancel::<CANCEL_CODE>(); - break; - } - } - } - } -} diff --git a/library/stdarch/crates/stdarch-test/src/disassembly.rs b/library/stdarch/crates/stdarch-test/src/disassembly.rs index 4c136cff02ae6..237e8d2dc28a0 100644 --- a/library/stdarch/crates/stdarch-test/src/disassembly.rs +++ b/library/stdarch/crates/stdarch-test/src/disassembly.rs @@ -78,7 +78,7 @@ pub(crate) fn disassemble_myself() -> HashSet { let objdump = env::var("OBJDUMP").unwrap_or_else(|_| "objdump".to_string()); let add_args = if cfg!(target_vendor = "apple") && cfg!(target_arch = "aarch64") { // Target features need to be enabled for LLVM objdump on Darwin ARM64 - vec!["--mattr=+v8.6a,+crypto,+tme"] + vec!["--mattr=+v8.6a,+crypto"] } else if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) { vec!["--mattr=+zk,+zks,+zbc,+zbb"] } else { diff --git
a/library/stdarch/crates/stdarch-verify/tests/arm.rs b/library/stdarch/crates/stdarch-verify/tests/arm.rs index a35b8175fb223..86897908e062c 100644 --- a/library/stdarch/crates/stdarch-verify/tests/arm.rs +++ b/library/stdarch/crates/stdarch-verify/tests/arm.rs @@ -444,7 +444,6 @@ fn verify_all_signatures() { && !rust.file.ends_with("v6.rs\"") && !rust.file.ends_with("v7.rs\"") && !rust.file.ends_with("v8.rs\"") - && !rust.file.ends_with("tme.rs\"") && !rust.file.ends_with("mte.rs\"") && !rust.file.ends_with("ex.rs\"") && !skip_intrinsic_verify.contains(&rust.name) From 8fe87e96235801772df7e5a41482b322c8b65db3 Mon Sep 17 00:00:00 2001 From: sayantn Date: Tue, 11 Nov 2025 04:57:42 +0530 Subject: [PATCH 15/17] correct some `#[simd_test]` attributes --- .../stdarch/crates/core_arch/src/x86/avx2.rs | 2 +- .../crates/core_arch/src/x86/avx512bw.rs | 132 +++++++++--------- .../crates/core_arch/src/x86/avx512f.rs | 14 +- .../crates/core_arch/src/x86/avx512fp16.rs | 40 +++--- .../stdarch/crates/core_arch/src/x86/rdtsc.rs | 12 +- .../stdarch/crates/core_arch/src/x86/sse.rs | 5 +- .../crates/core_arch/src/x86_64/amx.rs | 4 +- .../crates/core_arch/src/x86_64/avx512f.rs | 6 +- 8 files changed, 110 insertions(+), 105 deletions(-) diff --git a/library/stdarch/crates/core_arch/src/x86/avx2.rs b/library/stdarch/crates/core_arch/src/x86/avx2.rs index 2e6e010a21f08..e8213615a22e1 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx2.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx2.rs @@ -5749,7 +5749,7 @@ mod tests { assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0)); } - #[simd_test(enable = "avx")] + #[simd_test(enable = "avx2")] unsafe fn test_mm256_extract_epi8() { #[rustfmt::skip] let a = _mm256_setr_epi8( diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs index 78f4dd53b96df..aee705fb46125 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs @@ -13335,7 +13335,7 @@ mod tests { assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f")] + #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_mask_max_epu16() { #[rustfmt::skip] let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -13352,7 +13352,7 @@ mod tests { assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f")] + #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_maskz_max_epu16() { #[rustfmt::skip] let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -13369,7 +13369,7 @@ mod tests { assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm256_mask_max_epu16() { let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -13380,7 +13380,7 @@ mod tests { assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm256_maskz_max_epu16() { let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -13391,7 +13391,7 @@ mod tests { assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm_mask_max_epu16() { let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); @@ -13402,7 
+13402,7 @@ mod tests { assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm_maskz_max_epu16() { let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); @@ -13434,7 +13434,7 @@ mod tests { assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f")] + #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_mask_max_epu8() { #[rustfmt::skip] let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -13462,7 +13462,7 @@ mod tests { assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f")] + #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_maskz_max_epu8() { #[rustfmt::skip] let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -13489,7 +13489,7 @@ mod tests { assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm256_mask_max_epu8() { #[rustfmt::skip] let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -13506,7 +13506,7 @@ mod tests { assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm256_maskz_max_epu8() { #[rustfmt::skip] let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -13523,7 +13523,7 @@ mod tests { assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm_mask_max_epu8() { let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -13534,7 +13534,7 @@ mod tests { assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm_maskz_max_epu8() { let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -13560,7 +13560,7 @@ mod tests { assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f")] + #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_mask_max_epi16() { #[rustfmt::skip] let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -13577,7 +13577,7 @@ mod tests { assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f")] + #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_maskz_max_epi16() { #[rustfmt::skip] let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -13594,7 +13594,7 @@ mod tests { assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm256_mask_max_epi16() { let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -13605,7 +13605,7 @@ mod tests { assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm256_maskz_max_epi16() { let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -13616,7 +13616,7 @@ mod tests { assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm_mask_max_epi16() { let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); let b = _mm_set_epi16(7, 6, 5, 4, 
3, 2, 1, 0); @@ -13627,7 +13627,7 @@ mod tests { assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm_maskz_max_epi16() { let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); @@ -13659,7 +13659,7 @@ mod tests { assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f")] + #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_mask_max_epi8() { #[rustfmt::skip] let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -13687,7 +13687,7 @@ mod tests { assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f")] + #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_maskz_max_epi8() { #[rustfmt::skip] let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -13714,7 +13714,7 @@ mod tests { assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm256_mask_max_epi8() { #[rustfmt::skip] let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -13731,7 +13731,7 @@ mod tests { assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm256_maskz_max_epi8() { #[rustfmt::skip] let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -13748,7 +13748,7 @@ mod tests { assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm_mask_max_epi8() { let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -13759,7 +13759,7 @@ mod tests { assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm_maskz_max_epi8() { let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -13785,7 +13785,7 @@ mod tests { assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f")] + #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_mask_min_epu16() { #[rustfmt::skip] let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -13802,7 +13802,7 @@ mod tests { assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f")] + #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_maskz_min_epu16() { #[rustfmt::skip] let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -13819,7 +13819,7 @@ mod tests { assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm256_mask_min_epu16() { let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -13830,7 +13830,7 @@ mod tests { assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm256_maskz_min_epu16() { let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -13841,7 +13841,7 @@ mod tests { assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm_mask_min_epu16() { let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); let b = 
_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); @@ -13852,7 +13852,7 @@ mod tests { assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm_maskz_min_epu16() { let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); @@ -13884,7 +13884,7 @@ mod tests { assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f")] + #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_mask_min_epu8() { #[rustfmt::skip] let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -13912,7 +13912,7 @@ mod tests { assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f")] + #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_maskz_min_epu8() { #[rustfmt::skip] let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -13939,7 +13939,7 @@ mod tests { assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm256_mask_min_epu8() { #[rustfmt::skip] let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -13956,7 +13956,7 @@ mod tests { assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm256_maskz_min_epu8() { #[rustfmt::skip] let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -13973,7 +13973,7 @@ mod tests { assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm_mask_min_epu8() { let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -13984,7 +13984,7 @@ mod tests { assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm_maskz_min_epu8() { let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -14010,7 +14010,7 @@ mod tests { assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f")] + #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_mask_min_epi16() { #[rustfmt::skip] let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -14027,7 +14027,7 @@ mod tests { assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f")] + #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_maskz_min_epi16() { #[rustfmt::skip] let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -14044,7 +14044,7 @@ mod tests { assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm256_mask_min_epi16() { let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -14055,7 +14055,7 @@ mod tests { assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm256_maskz_min_epi16() { let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -14066,7 +14066,7 @@ mod tests { assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm_mask_min_epi16() { let a = _mm_set_epi16(0, 1, 
2, 3, 4, 5, 6, 7); let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); @@ -14077,7 +14077,7 @@ mod tests { assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm_maskz_min_epi16() { let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); let b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); @@ -14109,7 +14109,7 @@ mod tests { assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f")] + #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_mask_min_epi8() { #[rustfmt::skip] let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -14137,7 +14137,7 @@ mod tests { assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f")] + #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_maskz_min_epi8() { #[rustfmt::skip] let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -14164,7 +14164,7 @@ mod tests { assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm256_mask_min_epi8() { #[rustfmt::skip] let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -14181,7 +14181,7 @@ mod tests { assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm256_maskz_min_epi8() { #[rustfmt::skip] let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -14198,7 +14198,7 @@ mod tests { assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm_mask_min_epi8() { let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -14209,7 +14209,7 @@ mod tests { assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm_maskz_min_epi8() { let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -16326,7 +16326,7 @@ mod tests { assert_eq_m128i(r, a); } - #[simd_test(enable = "avx512f,avx512bw")] + #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_mask_loadu_epi16() { let src = _mm512_set1_epi16(42); let a = &[ @@ -16344,7 +16344,7 @@ mod tests { assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f,avx512bw")] + #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_maskz_loadu_epi16() { let a = &[ 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, @@ -16361,7 +16361,7 @@ mod tests { assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f,avx512bw")] + #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_mask_storeu_epi16() { let mut r = [42_i16; 32]; let a = &[ @@ -16379,7 +16379,7 @@ mod tests { assert_eq_m512i(_mm512_loadu_epi16(r.as_ptr()), e); } - #[simd_test(enable = "avx512f,avx512bw")] + #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_mask_loadu_epi8() { let src = _mm512_set1_epi8(42); let a = &[ @@ -16399,7 +16399,7 @@ mod tests { assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f,avx512bw")] + #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_maskz_loadu_epi8() { let a = &[ 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, @@ -16418,7 +16418,7 @@ mod tests { assert_eq_m512i(r, e); } - #[simd_test(enable = "avx512f,avx512bw")] + #[simd_test(enable = "avx512bw")] unsafe fn 
test_mm512_mask_storeu_epi8() { let mut r = [42_i8; 64]; let a = &[ @@ -16438,7 +16438,7 @@ mod tests { assert_eq_m512i(_mm512_loadu_epi8(r.as_ptr()), e); } - #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm256_mask_loadu_epi16() { let src = _mm256_set1_epi16(42); let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; @@ -16452,7 +16452,7 @@ mod tests { assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm256_maskz_loadu_epi16() { let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; let p = a.as_ptr(); @@ -16463,7 +16463,7 @@ mod tests { assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm256_mask_storeu_epi16() { let mut r = [42_i16; 16]; let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; @@ -16477,7 +16477,7 @@ mod tests { assert_eq_m256i(_mm256_loadu_epi16(r.as_ptr()), e); } - #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm256_mask_loadu_epi8() { let src = _mm256_set1_epi8(42); let a = &[ @@ -16495,7 +16495,7 @@ mod tests { assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm256_maskz_loadu_epi8() { let a = &[ 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, @@ -16512,7 +16512,7 @@ mod tests { assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm256_mask_storeu_epi8() { let mut r = [42_i8; 32]; let a = &[ @@ -16530,7 +16530,7 @@ mod tests { assert_eq_m256i(_mm256_loadu_epi8(r.as_ptr()), e); } - #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm_mask_loadu_epi16() { let src = _mm_set1_epi16(42); let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8]; @@ -16542,7 +16542,7 @@ mod tests { assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm_maskz_loadu_epi16() { let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8]; let p = a.as_ptr(); @@ -16553,7 +16553,7 @@ mod tests { assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm_mask_storeu_epi16() { let mut r = [42_i16; 8]; let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8]; @@ -16565,7 +16565,7 @@ mod tests { assert_eq_m128i(_mm_loadu_epi16(r.as_ptr()), e); } - #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm_mask_loadu_epi8() { let src = _mm_set1_epi8(42); let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; @@ -16579,7 +16579,7 @@ mod tests { assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm_maskz_loadu_epi8() { let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; let p = a.as_ptr(); @@ -16590,7 +16590,7 @@ mod tests { assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm_mask_storeu_epi8() { let mut r = [42_i8; 16]; let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; diff --git 
a/library/stdarch/crates/core_arch/src/x86/avx512f.rs b/library/stdarch/crates/core_arch/src/x86/avx512f.rs index ebffb9ac92983..f7bf9178dbb99 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512f.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512f.rs @@ -56076,7 +56076,7 @@ mod tests { assert_eq!(r, e); } - #[simd_test(enable = "avx512dq")] + #[simd_test(enable = "avx512f")] unsafe fn test_kortest_mask16_u8() { let a: __mmask16 = 0b0110100101101001; let b: __mmask16 = 0b1011011010110110; @@ -56086,7 +56086,7 @@ mod tests { assert_eq!(all_ones, 1); } - #[simd_test(enable = "avx512dq")] + #[simd_test(enable = "avx512f")] unsafe fn test_kortestc_mask16_u8() { let a: __mmask16 = 0b0110100101101001; let b: __mmask16 = 0b1011011010110110; @@ -56094,7 +56094,7 @@ mod tests { assert_eq!(r, 1); } - #[simd_test(enable = "avx512dq")] + #[simd_test(enable = "avx512f")] unsafe fn test_kortestz_mask16_u8() { let a: __mmask16 = 0b0110100101101001; let b: __mmask16 = 0b1011011010110110; @@ -56102,7 +56102,7 @@ mod tests { assert_eq!(r, 0); } - #[simd_test(enable = "avx512dq")] + #[simd_test(enable = "avx512f")] unsafe fn test_kshiftli_mask16() { let a: __mmask16 = 0b1001011011000011; let r = _kshiftli_mask16::<3>(a); @@ -56122,7 +56122,7 @@ mod tests { assert_eq!(r, e); } - #[simd_test(enable = "avx512dq")] + #[simd_test(enable = "avx512f")] unsafe fn test_kshiftri_mask16() { let a: __mmask16 = 0b1010100100111100; let r = _kshiftri_mask16::<3>(a); @@ -57383,7 +57383,7 @@ mod tests { assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f")] + #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_maskz_set1_epi32() { let a: i32 = 11; let r = _mm256_maskz_set1_epi32(0, a); @@ -57404,7 +57404,7 @@ mod tests { assert_eq_m128i(r, e); } - #[simd_test(enable = "avx512f")] + #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_maskz_set1_epi32() { let a: i32 = 11; let r = _mm_maskz_set1_epi32(0, a); diff --git a/library/stdarch/crates/core_arch/src/x86/avx512fp16.rs b/library/stdarch/crates/core_arch/src/x86/avx512fp16.rs index 293fda3064dcb..13cae45d0f811 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512fp16.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512fp16.rs @@ -20766,7 +20766,7 @@ mod tests { assert_eq_m128h(r, e); } - #[simd_test(enable = "avx512fp16")] + #[simd_test(enable = "avx512fp16,avx512vl")] unsafe fn test_mm_mask_fmsub_round_sh() { let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); @@ -20783,7 +20783,7 @@ mod tests { assert_eq_m128h(r, e); } - #[simd_test(enable = "avx512fp16")] + #[simd_test(enable = "avx512fp16,avx512vl")] unsafe fn test_mm_mask3_fmsub_round_sh() { let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); @@ -20800,7 +20800,7 @@ mod tests { assert_eq_m128h(r, e); } - #[simd_test(enable = "avx512fp16")] + #[simd_test(enable = "avx512fp16,avx512vl")] unsafe fn test_mm_maskz_fmsub_round_sh() { let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.); let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.); @@ -24529,7 +24529,7 @@ mod tests { assert_eq_m256h(r, e); } - #[simd_test(enable = "avx512fp16")] + #[simd_test(enable = "avx512fp16,avx512vl")] unsafe fn test_mm512_mask_cvtepi32_ph() { let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); let src = _mm256_set_ph( @@ -24542,7 +24542,7 @@ mod tests { assert_eq_m256h(r, e); } - #[simd_test(enable = "avx512fp16")] 
+ #[simd_test(enable = "avx512fp16,avx512vl")] unsafe fn test_mm512_maskz_cvtepi32_ph() { let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); let r = _mm512_maskz_cvtepi32_ph(0b0101010101010101, a); @@ -24552,7 +24552,7 @@ mod tests { assert_eq_m256h(r, e); } - #[simd_test(enable = "avx512fp16")] + #[simd_test(enable = "avx512fp16,avx512vl")] unsafe fn test_mm512_cvt_roundepi32_ph() { let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); let r = _mm512_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); @@ -24562,7 +24562,7 @@ mod tests { assert_eq_m256h(r, e); } - #[simd_test(enable = "avx512fp16")] + #[simd_test(enable = "avx512fp16,avx512vl")] unsafe fn test_mm512_mask_cvt_roundepi32_ph() { let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); let src = _mm256_set_ph( @@ -24579,7 +24579,7 @@ mod tests { assert_eq_m256h(r, e); } - #[simd_test(enable = "avx512fp16")] + #[simd_test(enable = "avx512fp16,avx512vl")] unsafe fn test_mm512_maskz_cvt_roundepi32_ph() { let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); let r = _mm512_maskz_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( @@ -24658,7 +24658,7 @@ mod tests { assert_eq_m128h(r, e); } - #[simd_test(enable = "avx512fp16")] + #[simd_test(enable = "avx512fp16,avx512vl")] unsafe fn test_mm512_cvtepu32_ph() { let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); let r = _mm512_cvtepu32_ph(a); @@ -24668,7 +24668,7 @@ mod tests { assert_eq_m256h(r, e); } - #[simd_test(enable = "avx512fp16")] + #[simd_test(enable = "avx512fp16,avx512vl")] unsafe fn test_mm512_mask_cvtepu32_ph() { let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); let src = _mm256_set_ph( @@ -24681,7 +24681,7 @@ mod tests { assert_eq_m256h(r, e); } - #[simd_test(enable = "avx512fp16")] + #[simd_test(enable = "avx512fp16,avx512vl")] unsafe fn test_mm512_maskz_cvtepu32_ph() { let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); let r = _mm512_maskz_cvtepu32_ph(0b0101010101010101, a); @@ -24691,7 +24691,7 @@ mod tests { assert_eq_m256h(r, e); } - #[simd_test(enable = "avx512fp16")] + #[simd_test(enable = "avx512fp16,avx512vl")] unsafe fn test_mm512_cvt_roundepu32_ph() { let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); let r = _mm512_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); @@ -24701,7 +24701,7 @@ mod tests { assert_eq_m256h(r, e); } - #[simd_test(enable = "avx512fp16")] + #[simd_test(enable = "avx512fp16,avx512vl")] unsafe fn test_mm512_mask_cvt_roundepu32_ph() { let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); let src = _mm256_set_ph( @@ -24719,7 +24719,7 @@ mod tests { assert_eq_m256h(r, e); } - #[simd_test(enable = "avx512fp16")] + #[simd_test(enable = "avx512fp16,avx512vl")] unsafe fn test_mm512_maskz_cvt_roundepu32_ph() { let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); let r = _mm512_maskz_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( @@ -25006,7 +25006,7 @@ mod tests { assert_eq_m128h(r, e); } - #[simd_test(enable = "avx512fp16")] + #[simd_test(enable = "avx512fp16,avx512vl")] unsafe fn test_mm512_cvtxps_ph() { let a = _mm512_set_ps( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, @@ -25018,7 +25018,7 @@ mod tests { assert_eq_m256h(r, e); } 
- #[simd_test(enable = "avx512fp16")] + #[simd_test(enable = "avx512fp16,avx512vl")] unsafe fn test_mm512_mask_cvtxps_ph() { let a = _mm512_set_ps( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, @@ -25033,7 +25033,7 @@ mod tests { assert_eq_m256h(r, e); } - #[simd_test(enable = "avx512fp16")] + #[simd_test(enable = "avx512fp16,avx512vl")] unsafe fn test_mm512_maskz_cvtxps_ph() { let a = _mm512_set_ps( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, @@ -25045,7 +25045,7 @@ mod tests { assert_eq_m256h(r, e); } - #[simd_test(enable = "avx512fp16")] + #[simd_test(enable = "avx512fp16,avx512vl")] unsafe fn test_mm512_cvtx_roundps_ph() { let a = _mm512_set_ps( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, @@ -25057,7 +25057,7 @@ mod tests { assert_eq_m256h(r, e); } - #[simd_test(enable = "avx512fp16")] + #[simd_test(enable = "avx512fp16,avx512vl")] unsafe fn test_mm512_mask_cvtx_roundps_ph() { let a = _mm512_set_ps( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, @@ -25077,7 +25077,7 @@ mod tests { assert_eq_m256h(r, e); } - #[simd_test(enable = "avx512fp16")] + #[simd_test(enable = "avx512fp16,avx512vl")] unsafe fn test_mm512_maskz_cvtx_roundps_ph() { let a = _mm512_set_ps( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, diff --git a/library/stdarch/crates/core_arch/src/x86/rdtsc.rs b/library/stdarch/crates/core_arch/src/x86/rdtsc.rs index 3b348153d602d..89292d78af16c 100644 --- a/library/stdarch/crates/core_arch/src/x86/rdtsc.rs +++ b/library/stdarch/crates/core_arch/src/x86/rdtsc.rs @@ -64,16 +64,16 @@ mod tests { use crate::core_arch::x86::*; use stdarch_test::simd_test; - #[simd_test(enable = "sse2")] - unsafe fn test_rdtsc() { - let r = _rdtsc(); + #[test] + fn test_rdtsc() { + let r = unsafe { _rdtsc() }; assert_ne!(r, 0); // The chances of this being 0 are infinitesimal } - #[simd_test(enable = "sse2")] - unsafe fn test_rdtscp() { + #[test] + fn test_rdtscp() { let mut aux = 0; - let r = __rdtscp(&mut aux); + let r = unsafe { __rdtscp(&mut aux) }; assert_ne!(r, 0); // The chances of this being 0 are infinitesimal } } diff --git a/library/stdarch/crates/core_arch/src/x86/sse.rs b/library/stdarch/crates/core_arch/src/x86/sse.rs index 86f743e76d882..7dd96dd1c9d76 100644 --- a/library/stdarch/crates/core_arch/src/x86/sse.rs +++ b/library/stdarch/crates/core_arch/src/x86/sse.rs @@ -3052,8 +3052,9 @@ mod tests { assert_eq_m128(r, _mm_set1_ps(0.0)); } - #[simd_test(enable = "sse")] - unsafe fn test_MM_SHUFFLE() { + #[test] + #[allow(non_snake_case)] + fn test_MM_SHUFFLE() { assert_eq!(_MM_SHUFFLE(0, 1, 1, 3), 0b00_01_01_11); assert_eq!(_MM_SHUFFLE(3, 1, 1, 0), 0b11_01_01_00); assert_eq!(_MM_SHUFFLE(1, 2, 2, 1), 0b01_10_10_01); diff --git a/library/stdarch/crates/core_arch/src/x86_64/amx.rs b/library/stdarch/crates/core_arch/src/x86_64/amx.rs index c87514980df6f..3e0ac8f47cea0 100644 --- a/library/stdarch/crates/core_arch/src/x86_64/amx.rs +++ b/library/stdarch/crates/core_arch/src/x86_64/amx.rs @@ -937,7 +937,7 @@ mod tests { assert_eq!(res, [[128.0_f32; 16]; 16]); } - #[simd_test(enable = "amx-tile")] + #[simd_test(enable = "amx-movrs")] unsafe fn test_tile_loaddrs() { _init_amx(); let mut config = __tilecfg::default(); @@ -954,7 +954,7 @@ mod tests { assert_eq!(out, [[1; 64]; 16]); } - #[simd_test(enable = "amx-tile")] + #[simd_test(enable = "amx-movrs")] unsafe fn test_tile_stream_loaddrs() { 
_init_amx(); let mut config = __tilecfg::default(); diff --git a/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs b/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs index 934c9e2812c42..a2656c8535634 100644 --- a/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs +++ b/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs @@ -6453,6 +6453,7 @@ mod tests { assert_eq_m512d(_mm512_setzero_pd(), _mm512_set1_pd(0.)); } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_set1_epi64() { let r = _mm512_set_epi64(2, 2, 2, 2, 2, 2, 2, 2); assert_eq_m512i(r, _mm512_set1_epi64(2)); @@ -6464,6 +6465,7 @@ mod tests { assert_eq_m512d(expected, _mm512_set1_pd(2.)); } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_set4_epi64() { let r = _mm512_set_epi64(4, 3, 2, 1, 4, 3, 2, 1); assert_eq_m512i(r, _mm512_set4_epi64(4, 3, 2, 1)); @@ -6475,6 +6477,7 @@ mod tests { assert_eq_m512d(r, _mm512_set4_pd(4., 3., 2., 1.)); } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_setr4_epi64() { let r = _mm512_set_epi64(4, 3, 2, 1, 4, 3, 2, 1); assert_eq_m512i(r, _mm512_setr4_epi64(1, 2, 3, 4)); @@ -7335,6 +7338,7 @@ mod tests { assert_eq_m512i(r, _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0)) } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cmpneq_epi64_mask() { let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100); let b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100); @@ -9685,7 +9689,7 @@ mod tests { assert_eq_m256i(r, e); } - #[simd_test(enable = "avx512f")] + #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_maskz_permutex_epi64() { let a = _mm256_set_epi64x(3, 2, 1, 0); let r = _mm256_maskz_permutex_epi64::<0b11_11_11_11>(0, a); From ac2d97254e14e52f3e5e948405715d78e01f17a8 Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Mon, 17 Nov 2025 16:01:02 +0100 Subject: [PATCH 16/17] correct signedness of pmadd arguments --- library/stdarch/crates/core_arch/src/x86/avx2.rs | 4 ++-- library/stdarch/crates/core_arch/src/x86/avx512bw.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/library/stdarch/crates/core_arch/src/x86/avx2.rs b/library/stdarch/crates/core_arch/src/x86/avx2.rs index e8213615a22e1..8be302cabc778 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx2.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx2.rs @@ -1773,7 +1773,7 @@ pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpmaddubsw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(pmaddubsw(a.as_u8x32(), b.as_u8x32())) } + unsafe { transmute(pmaddubsw(a.as_u8x32(), b.as_i8x32())) } } /// Loads packed 32-bit integers from memory pointed by `mem_addr` using `mask` @@ -3702,7 +3702,7 @@ unsafe extern "C" { #[link_name = "llvm.x86.avx2.phsub.sw"] fn phsubsw(a: i16x16, b: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.pmadd.ub.sw"] - fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16; + fn pmaddubsw(a: u8x32, b: i8x32) -> i16x16; #[link_name = "llvm.x86.avx2.mpsadbw"] fn mpsadbw(a: u8x32, b: u8x32, imm8: i8) -> u16x16; #[link_name = "llvm.x86.avx2.pmul.hr.sw"] diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs index aee705fb46125..0e2dd3ad4068f 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs @@ -5955,7 +5955,7 @@ pub fn _mm_maskz_madd_epi16(k: __mmask8, a: __m128i, b: 
__m128i) -> __m128i { #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpmaddubsw))] pub fn _mm512_maddubs_epi16(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vpmaddubsw(a.as_i8x64(), b.as_i8x64())) } + unsafe { transmute(vpmaddubsw(a.as_u8x64(), b.as_i8x64())) } } /// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11688,7 +11688,7 @@ unsafe extern "C" { fn vpmulhrsw(a: i16x32, b: i16x32) -> i16x32; #[link_name = "llvm.x86.avx512.pmaddubs.w.512"] - fn vpmaddubsw(a: i8x64, b: i8x64) -> i16x32; + fn vpmaddubsw(a: u8x64, b: i8x64) -> i16x32; #[link_name = "llvm.x86.avx512.packssdw.512"] fn vpackssdw(a: i32x16, b: i32x16) -> i16x32; From 5ff287bf8971e12e91c2a5e1f8c54c7f341a1911 Mon Sep 17 00:00:00 2001 From: Matthew Maurer Date: Wed, 19 Nov 2025 23:31:21 +0000 Subject: [PATCH 17/17] core: Enable avx10_target_feature to support stdarch update --- library/core/src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/library/core/src/lib.rs b/library/core/src/lib.rs index 2dd48ef18369b..ef85e36900869 100644 --- a/library/core/src/lib.rs +++ b/library/core/src/lib.rs @@ -195,6 +195,7 @@ // tidy-alphabetical-start #![feature(aarch64_unstable_target_feature)] #![feature(arm_target_feature)] +#![feature(avx10_target_feature)] #![feature(hexagon_target_feature)] #![feature(loongarch_target_feature)] #![feature(mips_target_feature)]
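The pmadd signedness correction in PATCH 16 mirrors the instruction's documented behaviour: vpmaddubsw multiplies unsigned bytes from a by signed bytes from b, then adds adjacent products with signed 16-bit saturation, so the binding's a lanes should be u8 and its b lanes i8. A scalar sketch of one output lane (an illustration of the documented semantics, not code from the patch):

    // One i16 output lane of vpmaddubsw: `a` bytes are unsigned,
    // `b` bytes are signed, and the pairwise sum saturates to i16.
    fn maddubs_lane(a: [u8; 2], b: [i8; 2]) -> i16 {
        let sum = (a[0] as i32) * (b[0] as i32) + (a[1] as i32) * (b[1] as i32);
        sum.clamp(i16::MIN as i32, i16::MAX as i32) as i16
    }

    fn main() {
        // 0xFF must act as +255, not -1: 255 * 127 + 255 * 127 saturates.
        assert_eq!(maddubs_lane([255, 255], [127, 127]), i16::MAX);
        assert_eq!(maddubs_lane([255, 0], [-1, 0]), -255);
    }

Since the lanes cross into LLVM bit-for-bit, the old all-i8 binding computed the same results; the new signature simply records which operand is unsigned so the Rust declaration matches the intrinsic's convention.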