From 480aafcd781254d633e11e502fdcfabb6036844c Mon Sep 17 00:00:00 2001 From: sayantn Date: Thu, 2 Oct 2025 00:41:58 +0530 Subject: [PATCH 1/3] Use Inline ASM for SSE4a nontemporal stores --- crates/core_arch/src/x86/sse4a.rs | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/crates/core_arch/src/x86/sse4a.rs b/crates/core_arch/src/x86/sse4a.rs index 051b77d02d..b9692a2783 100644 --- a/crates/core_arch/src/x86/sse4a.rs +++ b/crates/core_arch/src/x86/sse4a.rs @@ -15,10 +15,6 @@ unsafe extern "C" { fn insertq(x: i64x2, y: i64x2) -> i64x2; #[link_name = "llvm.x86.sse4a.insertqi"] fn insertqi(x: i64x2, y: i64x2, len: u8, idx: u8) -> i64x2; - #[link_name = "llvm.x86.sse4a.movnt.sd"] - fn movntsd(x: *mut f64, y: __m128d); - #[link_name = "llvm.x86.sse4a.movnt.ss"] - fn movntss(x: *mut f32, y: __m128); } /// Extracts the bit range specified by `y` from the lower 64 bits of `x`. @@ -114,7 +110,12 @@ pub fn _mm_inserti_si64(x: __m128i, y: __m128i) #[cfg_attr(test, assert_instr(movntsd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) { - movntsd(p, a); + crate::arch::asm!( + vps!("movntsd", ",{a}"), + p = in(reg) p, + a = in(xmm_reg) a, + options(nostack, preserves_flags), + ); } /// Non-temporal store of `a.0` into `p`. @@ -134,7 +135,12 @@ pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) { #[cfg_attr(test, assert_instr(movntss))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_stream_ss(p: *mut f32, a: __m128) { - movntss(p, a); + crate::arch::asm!( + vps!("movntss", ",{a}"), + p = in(reg) p, + a = in(xmm_reg) a, + options(nostack, preserves_flags), + ); } #[cfg(test)] From 45a8896b9ddc497654686bfd026717802baa124b Mon Sep 17 00:00:00 2001 From: sayantn Date: Thu, 2 Oct 2025 00:42:57 +0530 Subject: [PATCH 2/3] Add `_mm_sfence` to all non-temporal intrinsic tests --- crates/core_arch/src/x86/avx.rs | 3 +++ crates/core_arch/src/x86/avx512f.rs | 3 +++ crates/core_arch/src/x86/sse.rs | 1 + crates/core_arch/src/x86/sse2.rs | 4 ++++ crates/core_arch/src/x86/sse4a.rs | 2 ++ crates/core_arch/src/x86_64/sse2.rs | 1 + 6 files changed, 14 insertions(+) diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs index c1bb897ce0..d0821a4e3f 100644 --- a/crates/core_arch/src/x86/avx.rs +++ b/crates/core_arch/src/x86/avx.rs @@ -4291,6 +4291,7 @@ mod tests { let a = _mm256_setr_epi64x(1, 2, 3, 4); let mut r = _mm256_undefined_si256(); _mm256_stream_si256(ptr::addr_of_mut!(r), a); + _mm_sfence(); assert_eq_m256i(r, a); } @@ -4305,6 +4306,7 @@ mod tests { let mut mem = Memory { data: [-1.0; 4] }; _mm256_stream_pd(ptr::addr_of_mut!(mem.data[0]), a); + _mm_sfence(); for i in 0..4 { assert_eq!(mem.data[i], get_m256d(a, i)); } @@ -4321,6 +4323,7 @@ mod tests { let mut mem = Memory { data: [-1.0; 8] }; _mm256_stream_ps(ptr::addr_of_mut!(mem.data[0]), a); + _mm_sfence(); for i in 0..8 { assert_eq!(mem.data[i], get_m256(a, i)); } diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 7f8f6b9cda..743dabf798 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -56328,6 +56328,7 @@ mod tests { let mut mem = Memory { data: [-1.0; 16] }; _mm512_stream_ps(&mut mem.data[0] as *mut f32, a); + _mm_sfence(); for i in 0..16 { assert_eq!(mem.data[i], get_m512(a, i)); } @@ -56344,6 +56345,7 @@ mod tests { let mut mem = Memory { data: [-1.0; 8] }; _mm512_stream_pd(&mut mem.data[0] as *mut f64, a); + _mm_sfence(); for i in 0..8 { assert_eq!(mem.data[i], get_m512d(a, i)); } @@ -56360,6 +56362,7 @@ mod tests { let mut mem = Memory { data: [-1; 8] }; _mm512_stream_si512(mem.data.as_mut_ptr().cast(), a); + _mm_sfence(); for i in 0..8 { assert_eq!(mem.data[i], get_m512i(a, i)); } diff --git a/crates/core_arch/src/x86/sse.rs b/crates/core_arch/src/x86/sse.rs index c5c6dc26b5..f47f9242ea 100644 --- a/crates/core_arch/src/x86/sse.rs +++ b/crates/core_arch/src/x86/sse.rs @@ -3329,6 +3329,7 @@ mod tests { let mut mem = Memory { data: [-1.0; 4] }; _mm_stream_ps(ptr::addr_of_mut!(mem.data[0]), a); + _mm_sfence(); for i in 0..4 { assert_eq!(mem.data[i], get_m128(a, i)); } diff --git a/crates/core_arch/src/x86/sse2.rs b/crates/core_arch/src/x86/sse2.rs index c9530a237a..aad79f28cc 100644 --- a/crates/core_arch/src/x86/sse2.rs +++ b/crates/core_arch/src/x86/sse2.rs @@ -4070,6 +4070,7 @@ mod tests { ); let mut r = _mm_set1_epi8(0); _mm_maskmoveu_si128(a, mask, ptr::addr_of_mut!(r) as *mut i8); + _mm_sfence(); let e = _mm_set_epi8(0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m128i(r, e); } @@ -4106,6 +4107,7 @@ mod tests { let a = _mm_setr_epi32(1, 2, 3, 4); let mut r = _mm_undefined_si128(); _mm_stream_si128(ptr::addr_of_mut!(r), a); + _mm_sfence(); assert_eq_m128i(r, a); } @@ -4117,6 +4119,7 @@ mod tests { let a: i32 = 7; let mut mem = boxed::Box::::new(-1); _mm_stream_si32(ptr::addr_of_mut!(*mem), a); + _mm_sfence(); assert_eq!(a, *mem); } @@ -4813,6 +4816,7 @@ mod tests { let mut mem = Memory { data: [-1.0; 2] }; _mm_stream_pd(ptr::addr_of_mut!(mem.data[0]), a); + _mm_sfence(); for i in 0..2 { assert_eq!(mem.data[i], get_m128d(a, i)); } diff --git a/crates/core_arch/src/x86/sse4a.rs b/crates/core_arch/src/x86/sse4a.rs index b9692a2783..fc0af10f9d 100644 --- a/crates/core_arch/src/x86/sse4a.rs +++ b/crates/core_arch/src/x86/sse4a.rs @@ -215,6 +215,7 @@ mod tests { let x = _mm_setr_pd(3.0, 4.0); _mm_stream_sd(d, x); + _mm_sfence(); } assert_eq!(mem.data[0], 3.0); assert_eq!(mem.data[1], 2.0); @@ -240,6 +241,7 @@ mod tests { let x = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); _mm_stream_ss(d, x); + _mm_sfence(); } assert_eq!(mem.data[0], 5.0); assert_eq!(mem.data[1], 2.0); diff --git a/crates/core_arch/src/x86_64/sse2.rs b/crates/core_arch/src/x86_64/sse2.rs index 475e2d2a83..464f9ca4e1 100644 --- a/crates/core_arch/src/x86_64/sse2.rs +++ b/crates/core_arch/src/x86_64/sse2.rs @@ -200,6 +200,7 @@ mod tests { let a: i64 = 7; let mut mem = boxed::Box::::new(-1); _mm_stream_si64(ptr::addr_of_mut!(*mem), a); + _mm_sfence(); assert_eq!(a, *mem); } From 0a246948bf241c3daccafbbd152a966115ee52d4 Mon Sep 17 00:00:00 2001 From: sayantn Date: Sun, 5 Oct 2025 07:04:36 +0530 Subject: [PATCH 3/3] Add comments in NT asm blocks for future reference --- crates/core_arch/src/x86/avx.rs | 3 +++ crates/core_arch/src/x86/avx512f.rs | 3 +++ crates/core_arch/src/x86/sse.rs | 1 + crates/core_arch/src/x86/sse2.rs | 3 +++ crates/core_arch/src/x86/sse4a.rs | 2 ++ crates/core_arch/src/x86_64/sse2.rs | 1 + 6 files changed, 13 insertions(+) diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs index d0821a4e3f..c2c2febf18 100644 --- a/crates/core_arch/src/x86/avx.rs +++ b/crates/core_arch/src/x86/avx.rs @@ -1833,6 +1833,7 @@ pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vmovntdq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) { + // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough crate::arch::asm!( vps!("vmovntdq", ",{a}"), p = in(reg) mem_addr, @@ -1861,6 +1862,7 @@ pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) { #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) { + // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough crate::arch::asm!( vps!("vmovntpd", ",{a}"), p = in(reg) mem_addr, @@ -1890,6 +1892,7 @@ pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) { #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) { + // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough crate::arch::asm!( vps!("vmovntps", ",{a}"), p = in(reg) mem_addr, diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 743dabf798..b60df7dbc9 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -29593,6 +29593,7 @@ pub fn _mm_mask_testn_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask #[cfg_attr(test, assert_instr(vmovntps))] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) { + // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough crate::arch::asm!( vps!("vmovntps", ",{a}"), p = in(reg) mem_addr, @@ -29619,6 +29620,7 @@ pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) { #[cfg_attr(test, assert_instr(vmovntpd))] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) { + // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough crate::arch::asm!( vps!("vmovntpd", ",{a}"), p = in(reg) mem_addr, @@ -29645,6 +29647,7 @@ pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) { #[cfg_attr(test, assert_instr(vmovntdq))] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm512_stream_si512(mem_addr: *mut __m512i, a: __m512i) { + // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough crate::arch::asm!( vps!("vmovntdq", ",{a}"), p = in(reg) mem_addr, diff --git a/crates/core_arch/src/x86/sse.rs b/crates/core_arch/src/x86/sse.rs index f47f9242ea..be5ce8191a 100644 --- a/crates/core_arch/src/x86/sse.rs +++ b/crates/core_arch/src/x86/sse.rs @@ -2022,6 +2022,7 @@ unsafe extern "C" { #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) { + // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough crate::arch::asm!( vps!("movntps", ",{a}"), p = in(reg) mem_addr, diff --git a/crates/core_arch/src/x86/sse2.rs b/crates/core_arch/src/x86/sse2.rs index aad79f28cc..2bdadd0b4b 100644 --- a/crates/core_arch/src/x86/sse2.rs +++ b/crates/core_arch/src/x86/sse2.rs @@ -1363,6 +1363,7 @@ pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) { #[cfg_attr(test, assert_instr(movntdq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) { + // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough crate::arch::asm!( vps!("movntdq", ",{a}"), p = in(reg) mem_addr, @@ -1390,6 +1391,7 @@ pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) { #[cfg_attr(test, assert_instr(movnti))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) { + // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough crate::arch::asm!( vps!("movnti", ",{a:e}"), // `:e` for 32bit value p = in(reg) mem_addr, @@ -2627,6 +2629,7 @@ pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d { #[stable(feature = "simd_x86", since = "1.27.0")] #[allow(clippy::cast_ptr_alignment)] pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) { + // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough crate::arch::asm!( vps!("movntpd", ",{a}"), p = in(reg) mem_addr, diff --git a/crates/core_arch/src/x86/sse4a.rs b/crates/core_arch/src/x86/sse4a.rs index fc0af10f9d..7978d018e4 100644 --- a/crates/core_arch/src/x86/sse4a.rs +++ b/crates/core_arch/src/x86/sse4a.rs @@ -110,6 +110,7 @@ pub fn _mm_inserti_si64(x: __m128i, y: __m128i) #[cfg_attr(test, assert_instr(movntsd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) { + // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough crate::arch::asm!( vps!("movntsd", ",{a}"), p = in(reg) p, @@ -135,6 +136,7 @@ pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) { #[cfg_attr(test, assert_instr(movntss))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_stream_ss(p: *mut f32, a: __m128) { + // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough crate::arch::asm!( vps!("movntss", ",{a}"), p = in(reg) p, diff --git a/crates/core_arch/src/x86_64/sse2.rs b/crates/core_arch/src/x86_64/sse2.rs index 464f9ca4e1..0894aa9810 100644 --- a/crates/core_arch/src/x86_64/sse2.rs +++ b/crates/core_arch/src/x86_64/sse2.rs @@ -78,6 +78,7 @@ pub fn _mm_cvttsd_si64x(a: __m128d) -> i64 { #[cfg_attr(test, assert_instr(movnti))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) { + // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough crate::arch::asm!( vps!("movnti", ",{a}"), p = in(reg) mem_addr,