Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions crates/core_arch/src/x86/avx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1833,6 +1833,7 @@ pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vmovntdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("vmovntdq", ",{a}"),
p = in(reg) mem_addr,
Expand Down Expand Up @@ -1861,6 +1862,7 @@ pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("vmovntpd", ",{a}"),
p = in(reg) mem_addr,
Expand Down Expand Up @@ -1890,6 +1892,7 @@ pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("vmovntps", ",{a}"),
p = in(reg) mem_addr,
Expand Down Expand Up @@ -4291,6 +4294,7 @@ mod tests {
let a = _mm256_setr_epi64x(1, 2, 3, 4);
let mut r = _mm256_undefined_si256();
_mm256_stream_si256(ptr::addr_of_mut!(r), a);
_mm_sfence();
assert_eq_m256i(r, a);
}

Expand All @@ -4305,6 +4309,7 @@ mod tests {
let mut mem = Memory { data: [-1.0; 4] };

_mm256_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
_mm_sfence();
for i in 0..4 {
assert_eq!(mem.data[i], get_m256d(a, i));
}
Expand All @@ -4321,6 +4326,7 @@ mod tests {
let mut mem = Memory { data: [-1.0; 8] };

_mm256_stream_ps(ptr::addr_of_mut!(mem.data[0]), a);
_mm_sfence();
for i in 0..8 {
assert_eq!(mem.data[i], get_m256(a, i));
}
Expand Down
6 changes: 6 additions & 0 deletions crates/core_arch/src/x86/avx512f.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29593,6 +29593,7 @@ pub fn _mm_mask_testn_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask
#[cfg_attr(test, assert_instr(vmovntps))]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("vmovntps", ",{a}"),
p = in(reg) mem_addr,
Expand All @@ -29619,6 +29620,7 @@ pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
#[cfg_attr(test, assert_instr(vmovntpd))]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("vmovntpd", ",{a}"),
p = in(reg) mem_addr,
Expand All @@ -29645,6 +29647,7 @@ pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
#[cfg_attr(test, assert_instr(vmovntdq))]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm512_stream_si512(mem_addr: *mut __m512i, a: __m512i) {
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("vmovntdq", ",{a}"),
p = in(reg) mem_addr,
Expand Down Expand Up @@ -56328,6 +56331,7 @@ mod tests {
let mut mem = Memory { data: [-1.0; 16] };

_mm512_stream_ps(&mut mem.data[0] as *mut f32, a);
_mm_sfence();
for i in 0..16 {
assert_eq!(mem.data[i], get_m512(a, i));
}
Expand All @@ -56344,6 +56348,7 @@ mod tests {
let mut mem = Memory { data: [-1.0; 8] };

_mm512_stream_pd(&mut mem.data[0] as *mut f64, a);
_mm_sfence();
for i in 0..8 {
assert_eq!(mem.data[i], get_m512d(a, i));
}
Expand All @@ -56360,6 +56365,7 @@ mod tests {
let mut mem = Memory { data: [-1; 8] };

_mm512_stream_si512(mem.data.as_mut_ptr().cast(), a);
_mm_sfence();
for i in 0..8 {
assert_eq!(mem.data[i], get_m512i(a, i));
}
Expand Down
2 changes: 2 additions & 0 deletions crates/core_arch/src/x86/sse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2022,6 +2022,7 @@ unsafe extern "C" {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) {
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("movntps", ",{a}"),
p = in(reg) mem_addr,
Expand Down Expand Up @@ -3329,6 +3330,7 @@ mod tests {
let mut mem = Memory { data: [-1.0; 4] };

_mm_stream_ps(ptr::addr_of_mut!(mem.data[0]), a);
_mm_sfence();
for i in 0..4 {
assert_eq!(mem.data[i], get_m128(a, i));
}
Expand Down
7 changes: 7 additions & 0 deletions crates/core_arch/src/x86/sse2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1363,6 +1363,7 @@ pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
#[cfg_attr(test, assert_instr(movntdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("movntdq", ",{a}"),
p = in(reg) mem_addr,
Expand Down Expand Up @@ -1390,6 +1391,7 @@ pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
#[cfg_attr(test, assert_instr(movnti))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("movnti", ",{a:e}"), // `:e` for 32bit value
p = in(reg) mem_addr,
Expand Down Expand Up @@ -2627,6 +2629,7 @@ pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) {
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("movntpd", ",{a}"),
p = in(reg) mem_addr,
Expand Down Expand Up @@ -4070,6 +4073,7 @@ mod tests {
);
let mut r = _mm_set1_epi8(0);
_mm_maskmoveu_si128(a, mask, ptr::addr_of_mut!(r) as *mut i8);
_mm_sfence();
let e = _mm_set_epi8(0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
assert_eq_m128i(r, e);
}
Expand Down Expand Up @@ -4106,6 +4110,7 @@ mod tests {
let a = _mm_setr_epi32(1, 2, 3, 4);
let mut r = _mm_undefined_si128();
_mm_stream_si128(ptr::addr_of_mut!(r), a);
_mm_sfence();
assert_eq_m128i(r, a);
}

Expand All @@ -4117,6 +4122,7 @@ mod tests {
let a: i32 = 7;
let mut mem = boxed::Box::<i32>::new(-1);
_mm_stream_si32(ptr::addr_of_mut!(*mem), a);
_mm_sfence();
assert_eq!(a, *mem);
}

Expand Down Expand Up @@ -4813,6 +4819,7 @@ mod tests {
let mut mem = Memory { data: [-1.0; 2] };

_mm_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
_mm_sfence();
for i in 0..2 {
assert_eq!(mem.data[i], get_m128d(a, i));
}
Expand Down
22 changes: 16 additions & 6 deletions crates/core_arch/src/x86/sse4a.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,6 @@ unsafe extern "C" {
fn insertq(x: i64x2, y: i64x2) -> i64x2;
#[link_name = "llvm.x86.sse4a.insertqi"]
fn insertqi(x: i64x2, y: i64x2, len: u8, idx: u8) -> i64x2;
#[link_name = "llvm.x86.sse4a.movnt.sd"]
fn movntsd(x: *mut f64, y: __m128d);
#[link_name = "llvm.x86.sse4a.movnt.ss"]
fn movntss(x: *mut f32, y: __m128);
}

/// Extracts the bit range specified by `y` from the lower 64 bits of `x`.
Expand Down Expand Up @@ -114,7 +110,13 @@ pub fn _mm_inserti_si64<const LEN: i32, const IDX: i32>(x: __m128i, y: __m128i)
#[cfg_attr(test, assert_instr(movntsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) {
movntsd(p, a);
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("movntsd", ",{a}"),
p = in(reg) p,
a = in(xmm_reg) a,
options(nostack, preserves_flags),
);
}

/// Non-temporal store of `a.0` into `p`.
Expand All @@ -134,7 +136,13 @@ pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) {
#[cfg_attr(test, assert_instr(movntss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_ss(p: *mut f32, a: __m128) {
movntss(p, a);
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("movntss", ",{a}"),
p = in(reg) p,
a = in(xmm_reg) a,
options(nostack, preserves_flags),
);
}

#[cfg(test)]
Expand Down Expand Up @@ -209,6 +217,7 @@ mod tests {
let x = _mm_setr_pd(3.0, 4.0);

_mm_stream_sd(d, x);
_mm_sfence();
}
assert_eq!(mem.data[0], 3.0);
assert_eq!(mem.data[1], 2.0);
Expand All @@ -234,6 +243,7 @@ mod tests {
let x = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);

_mm_stream_ss(d, x);
_mm_sfence();
}
assert_eq!(mem.data[0], 5.0);
assert_eq!(mem.data[1], 2.0);
Expand Down
2 changes: 2 additions & 0 deletions crates/core_arch/src/x86_64/sse2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ pub fn _mm_cvttsd_si64x(a: __m128d) -> i64 {
#[cfg_attr(test, assert_instr(movnti))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) {
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
crate::arch::asm!(
vps!("movnti", ",{a}"),
p = in(reg) mem_addr,
Expand Down Expand Up @@ -200,6 +201,7 @@ mod tests {
let a: i64 = 7;
let mut mem = boxed::Box::<i64>::new(-1);
_mm_stream_si64(ptr::addr_of_mut!(*mem), a);
_mm_sfence();
assert_eq!(a, *mem);
}

Expand Down