Skip to content

Commit

Permalink
Use simd intrinsics for max and min (#1357)
Browse files Browse the repository at this point in the history
  • Loading branch information
Nugine committed Nov 21, 2022
1 parent 547e3b0 commit 32d6def
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 72 deletions.
72 changes: 36 additions & 36 deletions crates/core_arch/src/x86/avx2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1857,7 +1857,9 @@ pub unsafe fn _mm256_maskstore_epi64(mem_addr: *mut i64, mask: __m256i, a: __m25
#[cfg_attr(test, assert_instr(vpmaxsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
transmute(pmaxsw(a.as_i16x16(), b.as_i16x16()))
let a = a.as_i16x16();
let b = b.as_i16x16();
transmute(simd_select::<i16x16, _>(simd_gt(a, b), a, b))
}

/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
Expand All @@ -1869,7 +1871,9 @@ pub unsafe fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpmaxsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
transmute(pmaxsd(a.as_i32x8(), b.as_i32x8()))
let a = a.as_i32x8();
let b = b.as_i32x8();
transmute(simd_select::<i32x8, _>(simd_gt(a, b), a, b))
}

/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
Expand All @@ -1881,7 +1885,9 @@ pub unsafe fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpmaxsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
transmute(pmaxsb(a.as_i8x32(), b.as_i8x32()))
let a = a.as_i8x32();
let b = b.as_i8x32();
transmute(simd_select::<i8x32, _>(simd_gt(a, b), a, b))
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
Expand All @@ -1893,7 +1899,9 @@ pub unsafe fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpmaxuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
transmute(pmaxuw(a.as_u16x16(), b.as_u16x16()))
let a = a.as_u16x16();
let b = b.as_u16x16();
transmute(simd_select::<i16x16, _>(simd_gt(a, b), a, b))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
Expand All @@ -1905,7 +1913,9 @@ pub unsafe fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpmaxud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
transmute(pmaxud(a.as_u32x8(), b.as_u32x8()))
let a = a.as_u32x8();
let b = b.as_u32x8();
transmute(simd_select::<i32x8, _>(simd_gt(a, b), a, b))
}

/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
Expand All @@ -1917,7 +1927,9 @@ pub unsafe fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpmaxub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
transmute(pmaxub(a.as_u8x32(), b.as_u8x32()))
let a = a.as_u8x32();
let b = b.as_u8x32();
transmute(simd_select::<i8x32, _>(simd_gt(a, b), a, b))
}

/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
Expand All @@ -1929,7 +1941,9 @@ pub unsafe fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpminsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
transmute(pminsw(a.as_i16x16(), b.as_i16x16()))
let a = a.as_i16x16();
let b = b.as_i16x16();
transmute(simd_select::<i16x16, _>(simd_lt(a, b), a, b))
}

/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
Expand All @@ -1941,7 +1955,9 @@ pub unsafe fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpminsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
transmute(pminsd(a.as_i32x8(), b.as_i32x8()))
let a = a.as_i32x8();
let b = b.as_i32x8();
transmute(simd_select::<i32x8, _>(simd_lt(a, b), a, b))
}

/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
Expand All @@ -1953,7 +1969,9 @@ pub unsafe fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpminsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
transmute(pminsb(a.as_i8x32(), b.as_i8x32()))
let a = a.as_i8x32();
let b = b.as_i8x32();
transmute(simd_select::<i8x32, _>(simd_lt(a, b), a, b))
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
Expand All @@ -1965,7 +1983,9 @@ pub unsafe fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpminuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
transmute(pminuw(a.as_u16x16(), b.as_u16x16()))
let a = a.as_u16x16();
let b = b.as_u16x16();
transmute(simd_select::<i16x16, _>(simd_lt(a, b), a, b))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
Expand All @@ -1977,7 +1997,9 @@ pub unsafe fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpminud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
transmute(pminud(a.as_u32x8(), b.as_u32x8()))
let a = a.as_u32x8();
let b = b.as_u32x8();
transmute(simd_select::<i32x8, _>(simd_lt(a, b), a, b))
}

/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
Expand All @@ -1989,7 +2011,9 @@ pub unsafe fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpminub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
transmute(pminub(a.as_u8x32(), b.as_u8x32()))
let a = a.as_u8x32();
let b = b.as_u8x32();
transmute(simd_select::<i8x32, _>(simd_lt(a, b), a, b))
}

/// Creates mask from the most significant bit of each 8-bit element in `a`,
Expand Down Expand Up @@ -3620,30 +3644,6 @@ extern "C" {
fn maskstoreq(mem_addr: *mut i8, mask: i64x2, a: i64x2);
#[link_name = "llvm.x86.avx2.maskstore.q.256"]
fn maskstoreq256(mem_addr: *mut i8, mask: i64x4, a: i64x4);
#[link_name = "llvm.x86.avx2.pmaxs.w"]
fn pmaxsw(a: i16x16, b: i16x16) -> i16x16;
#[link_name = "llvm.x86.avx2.pmaxs.d"]
fn pmaxsd(a: i32x8, b: i32x8) -> i32x8;
#[link_name = "llvm.x86.avx2.pmaxs.b"]
fn pmaxsb(a: i8x32, b: i8x32) -> i8x32;
#[link_name = "llvm.x86.avx2.pmaxu.w"]
fn pmaxuw(a: u16x16, b: u16x16) -> u16x16;
#[link_name = "llvm.x86.avx2.pmaxu.d"]
fn pmaxud(a: u32x8, b: u32x8) -> u32x8;
#[link_name = "llvm.x86.avx2.pmaxu.b"]
fn pmaxub(a: u8x32, b: u8x32) -> u8x32;
#[link_name = "llvm.x86.avx2.pmins.w"]
fn pminsw(a: i16x16, b: i16x16) -> i16x16;
#[link_name = "llvm.x86.avx2.pmins.d"]
fn pminsd(a: i32x8, b: i32x8) -> i32x8;
#[link_name = "llvm.x86.avx2.pmins.b"]
fn pminsb(a: i8x32, b: i8x32) -> i8x32;
#[link_name = "llvm.x86.avx2.pminu.w"]
fn pminuw(a: u16x16, b: u16x16) -> u16x16;
#[link_name = "llvm.x86.avx2.pminu.d"]
fn pminud(a: u32x8, b: u32x8) -> u32x8;
#[link_name = "llvm.x86.avx2.pminu.b"]
fn pminub(a: u8x32, b: u8x32) -> u8x32;
#[link_name = "llvm.x86.avx2.mpsadbw"]
fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16;
#[link_name = "llvm.x86.avx2.pmulhu.w"]
Expand Down
24 changes: 12 additions & 12 deletions crates/core_arch/src/x86/sse2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,9 @@ pub unsafe fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pmaxsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i {
transmute(pmaxsw(a.as_i16x8(), b.as_i16x8()))
let a = a.as_i16x8();
let b = b.as_i16x8();
transmute(simd_select::<i16x8, _>(simd_gt(a, b), a, b))
}

/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
Expand All @@ -215,7 +217,9 @@ pub unsafe fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pmaxub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i {
transmute(pmaxub(a.as_u8x16(), b.as_u8x16()))
let a = a.as_u8x16();
let b = b.as_u8x16();
transmute(simd_select::<i8x16, _>(simd_gt(a, b), a, b))
}

/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
Expand All @@ -227,7 +231,9 @@ pub unsafe fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pminsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i {
transmute(pminsw(a.as_i16x8(), b.as_i16x8()))
let a = a.as_i16x8();
let b = b.as_i16x8();
transmute(simd_select::<i16x8, _>(simd_lt(a, b), a, b))
}

/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
Expand All @@ -239,7 +245,9 @@ pub unsafe fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pminub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i {
transmute(pminub(a.as_u8x16(), b.as_u8x16()))
let a = a.as_u8x16();
let b = b.as_u8x16();
transmute(simd_select::<i8x16, _>(simd_lt(a, b), a, b))
}

/// Multiplies the packed 16-bit integers in `a` and `b`.
Expand Down Expand Up @@ -2798,14 +2806,6 @@ extern "C" {
fn pavgw(a: u16x8, b: u16x8) -> u16x8;
#[link_name = "llvm.x86.sse2.pmadd.wd"]
fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
#[link_name = "llvm.x86.sse2.pmaxs.w"]
fn pmaxsw(a: i16x8, b: i16x8) -> i16x8;
#[link_name = "llvm.x86.sse2.pmaxu.b"]
fn pmaxub(a: u8x16, b: u8x16) -> u8x16;
#[link_name = "llvm.x86.sse2.pmins.w"]
fn pminsw(a: i16x8, b: i16x8) -> i16x8;
#[link_name = "llvm.x86.sse2.pminu.b"]
fn pminub(a: u8x16, b: u8x16) -> u8x16;
#[link_name = "llvm.x86.sse2.pmulh.w"]
fn pmulhw(a: i16x8, b: i16x8) -> i16x8;
#[link_name = "llvm.x86.sse2.pmulhu.w"]
Expand Down
48 changes: 24 additions & 24 deletions crates/core_arch/src/x86/sse41.rs
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,9 @@ pub unsafe fn _mm_insert_epi32<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
#[cfg_attr(test, assert_instr(pmaxsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
transmute(pmaxsb(a.as_i8x16(), b.as_i8x16()))
let a = a.as_i8x16();
let b = b.as_i8x16();
transmute(simd_select::<i8x16, _>(simd_gt(a, b), a, b))
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
Expand All @@ -293,7 +295,9 @@ pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pmaxuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
transmute(pmaxuw(a.as_u16x8(), b.as_u16x8()))
let a = a.as_u16x8();
let b = b.as_u16x8();
transmute(simd_select::<i16x8, _>(simd_gt(a, b), a, b))
}

/// Compares packed 32-bit integers in `a` and `b`, and returns packed maximum
Expand All @@ -305,7 +309,9 @@ pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pmaxsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
transmute(pmaxsd(a.as_i32x4(), b.as_i32x4()))
let a = a.as_i32x4();
let b = b.as_i32x4();
transmute(simd_select::<i32x4, _>(simd_gt(a, b), a, b))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
Expand All @@ -317,7 +323,9 @@ pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pmaxud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
transmute(pmaxud(a.as_u32x4(), b.as_u32x4()))
let a = a.as_u32x4();
let b = b.as_u32x4();
transmute(simd_select::<i32x4, _>(simd_gt(a, b), a, b))
}

/// Compares packed 8-bit integers in `a` and `b` and returns packed minimum
Expand All @@ -329,7 +337,9 @@ pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pminsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
transmute(pminsb(a.as_i8x16(), b.as_i8x16()))
let a = a.as_i8x16();
let b = b.as_i8x16();
transmute(simd_select::<i8x16, _>(simd_lt(a, b), a, b))
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
Expand All @@ -341,7 +351,9 @@ pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pminuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
transmute(pminuw(a.as_u16x8(), b.as_u16x8()))
let a = a.as_u16x8();
let b = b.as_u16x8();
transmute(simd_select::<i16x8, _>(simd_lt(a, b), a, b))
}

/// Compares packed 32-bit integers in `a` and `b`, and returns packed minimum
Expand All @@ -353,7 +365,9 @@ pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pminsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
transmute(pminsd(a.as_i32x4(), b.as_i32x4()))
let a = a.as_i32x4();
let b = b.as_i32x4();
transmute(simd_select::<i32x4, _>(simd_lt(a, b), a, b))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
Expand All @@ -365,7 +379,9 @@ pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pminud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
transmute(pminud(a.as_u32x4(), b.as_u32x4()))
let a = a.as_u32x4();
let b = b.as_u32x4();
transmute(simd_select::<i32x4, _>(simd_lt(a, b), a, b))
}

/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
Expand Down Expand Up @@ -1122,22 +1138,6 @@ extern "C" {
fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8;
#[link_name = "llvm.x86.sse41.insertps"]
fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
#[link_name = "llvm.x86.sse41.pmaxsb"]
fn pmaxsb(a: i8x16, b: i8x16) -> i8x16;
#[link_name = "llvm.x86.sse41.pmaxuw"]
fn pmaxuw(a: u16x8, b: u16x8) -> u16x8;
#[link_name = "llvm.x86.sse41.pmaxsd"]
fn pmaxsd(a: i32x4, b: i32x4) -> i32x4;
#[link_name = "llvm.x86.sse41.pmaxud"]
fn pmaxud(a: u32x4, b: u32x4) -> u32x4;
#[link_name = "llvm.x86.sse41.pminsb"]
fn pminsb(a: i8x16, b: i8x16) -> i8x16;
#[link_name = "llvm.x86.sse41.pminuw"]
fn pminuw(a: u16x8, b: u16x8) -> u16x8;
#[link_name = "llvm.x86.sse41.pminsd"]
fn pminsd(a: i32x4, b: i32x4) -> i32x4;
#[link_name = "llvm.x86.sse41.pminud"]
fn pminud(a: u32x4, b: u32x4) -> u32x4;
#[link_name = "llvm.x86.sse41.packusdw"]
fn packusdw(a: i32x4, b: i32x4) -> u16x8;
#[link_name = "llvm.x86.sse41.dppd"]
Expand Down

0 comments on commit 32d6def

Please sign in to comment.