Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement F16C intrinsics #737

Merged
merged 8 commits into from
May 9, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions ci/docker/x86_64-unknown-linux-gnu-emulated/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
wget \
bzip2

RUN wget https://github.com/gnzlbg/intel_sde/raw/master/sde-external-8.16.0-2018-01-30-lin.tar.bz2
RUN tar -xjf sde-external-8.16.0-2018-01-30-lin.tar.bz2
ENV CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER="/sde-external-8.16.0-2018-01-30-lin/sde64 --"
RUN wget https://github.com/gnzlbg/intel_sde/raw/master/sde-external-8.35.0-2019-03-11-lin.tar.bz2
RUN tar -xjf sde-external-8.35.0-2019-03-11-lin.tar.bz2
ENV CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER="/sde-external-8.35.0-2019-03-11-lin/sde64 -rtm_mode full --"
6 changes: 2 additions & 4 deletions crates/core_arch/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
abi_unadjusted,
adx_target_feature,
rtm_target_feature,
f16c_target_feature,
external_doc
)]
#![cfg_attr(test, feature(test, abi_vectorcall, untagged_unions))]
Expand Down Expand Up @@ -75,7 +76,4 @@ mod core_arch;
pub use self::core_arch::arch::*;

#[allow(unused_imports)]
use core::{ffi, intrinsics, marker, mem, ptr, sync};

#[cfg(test)]
use core::hint;
use core::{ffi, hint, intrinsics, marker, mem, ptr, sync};
4 changes: 4 additions & 0 deletions crates/core_arch/src/simd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,10 @@ simd_ty!(i32x8[i32]:
| x0, x1, x2, x3, x4, x5, x6, x7);
simd_ty!(i64x4[i64]: i64, i64, i64, i64 | x0, x1, x2, x3);

simd_ty!(f32x8[f32]:
f32, f32, f32, f32, f32, f32, f32, f32 |
x0, x1, x2, x3, x4, x5, x6, x7);

// 512-bit wide types:

simd_ty!(i32x16[i32]:
Expand Down
134 changes: 134 additions & 0 deletions crates/core_arch/src/x86/f16c.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
//! [F16C intrinsics].
//!
//! [F16C intrinsics]: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=fp16&expand=1769

use crate::{
core_arch::{simd::*, x86::*},
hint::unreachable_unchecked,
mem::transmute,
};

#[cfg(test)]
use stdsimd_test::assert_instr;

#[allow(improper_ctypes)]
extern "unadjusted" {
#[link_name = "llvm.x86.vcvtph2ps.128"]
fn llvm_vcvtph2ps_128(a: i16x8) -> f32x4;
#[link_name = "llvm.x86.vcvtph2ps.256"]
fn llvm_vcvtph2ps_256(a: i16x8) -> f32x8;
#[link_name = "llvm.x86.vcvtps2ph.128"]
fn llvm_vcvtps2ph_128(a: f32x4, rounding: i32) -> i16x8;
#[link_name = "llvm.x86.vcvtps2ph.256"]
fn llvm_vcvtps2ph_256(a: f32x8, rounding: i32) -> i16x8;
}

/// Converts the 4 x 16-bit half-precision float values in the lowest 64-bit of
/// the 128-bit vector `a` into 4 x 32-bit float values stored in a 128-bit wide
/// vector.
#[inline]
#[target_feature(enable = "f16c")]
#[cfg_attr(test, assert_instr("vcvtph2ps"))]
pub unsafe fn _mm_cvtph_ps(a: __m128i) -> __m128 {
transmute(llvm_vcvtph2ps_128(transmute(a)))
}

/// Converts the 8 x 16-bit half-precision float values in the 128-bit vector
/// `a` into 8 x 32-bit float values stored in a 256-bit wide vector.
#[inline]
#[target_feature(enable = "f16c")]
#[cfg_attr(test, assert_instr("vcvtph2ps"))]
pub unsafe fn _mm256_cvtph_ps(a: __m128i) -> __m256 {
transmute(llvm_vcvtph2ps_256(transmute(a)))
}

macro_rules! dispatch_rounding {
($rounding:ident, $call:ident) => {{
match $rounding {
0 => call!(0),
1 => call!(1),
2 => call!(2),
3 => call!(3),
4 => call!(4),
5 => call!(5),
6 => call!(6),
7 => call!(7),
_ => unreachable_unchecked(),
}
}};
}

/// Converts the 4 x 32-bit float values in the 128-bit vector `a` into 4 x
/// 16-bit half-precision float values stored in the lowest 64-bit of a 128-bit
/// vector.
///
/// Rounding is done according to the `imm_rounding` parameter, which can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest and suppress exceptions,
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down and suppress exceptions,
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up and suppress exceptions,
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate and suppress exceptions,
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`].
#[inline]
#[target_feature(enable = "f16c")]
#[rustc_args_required_const(1)]
#[cfg_attr(test, assert_instr("vcvtps2ph", imm_rounding = 0))]
pub unsafe fn _mm_cvtps_ph(a: __m128, imm_rounding: i32) -> __m128i {
let a = transmute(a);
macro_rules! call {
($rounding:expr) => {
llvm_vcvtps2ph_128(a, $rounding)
};
}
transmute(dispatch_rounding!(imm_rounding, call))
}

/// Converts the 8 x 32-bit float values in the 256-bit vector `a` into 8 x
/// 16-bit half-precision float values stored in a 128-bit wide vector.
///
/// Rounding is done according to the `imm_rounding` parameter, which can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest and suppress exceptions,
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down and suppress exceptions,
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up and suppress exceptions,
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate and suppress exceptions,
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`].
#[inline]
#[target_feature(enable = "f16c")]
#[rustc_args_required_const(1)]
#[cfg_attr(test, assert_instr("vcvtps2ph", imm_rounding = 0))]
pub unsafe fn _mm256_cvtps_ph(a: __m256, imm_rounding: i32) -> __m128i {
let a = transmute(a);
macro_rules! call {
($rounding:expr) => {
llvm_vcvtps2ph_256(a, $rounding)
};
}
transmute(dispatch_rounding!(imm_rounding, call))
}

#[cfg(test)]
mod tests {
use crate::{core_arch::x86::*, mem::transmute};
use stdsimd_test::simd_test;

#[simd_test(enable = "f16c")]
unsafe fn test_mm_cvtph_ps() {
let array = [1_f32, 2_f32, 3_f32, 4_f32];
let float_vec: __m128 = transmute(array);
let halfs: __m128i = _mm_cvtps_ph(float_vec, 0);
let floats: __m128 = _mm_cvtph_ps(halfs);
let result: [f32; 4] = transmute(floats);
assert_eq!(result, array);
}

#[simd_test(enable = "f16c")]
unsafe fn test_mm256_cvtph_ps() {
let array = [1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32];
let float_vec: __m256 = transmute(array);
let halfs: __m128i = _mm256_cvtps_ph(float_vec, 0);
let floats: __m256 = _mm256_cvtph_ps(halfs);
let result: [f32; 8] = transmute(floats);
assert_eq!(result, array);
}
}
3 changes: 3 additions & 0 deletions crates/core_arch/src/x86/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -568,3 +568,6 @@ pub use self::bt::*;

mod rtm;
pub use self::rtm::*;

mod f16c;
pub use self::f16c::*;
1 change: 1 addition & 0 deletions crates/core_arch/src/x86/rtm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ pub const _XBEGIN_STARTED: u32 = !0;

/// Transaction explicitly aborted with xabort. The parameter passed to xabort is available with
/// `_xabort_code(status)`.
#[allow(clippy::identity_op)]
pub const _XABORT_EXPLICIT: u32 = 1 << 0;

/// Transaction retry is possible.
Expand Down
1 change: 1 addition & 0 deletions crates/core_arch/tests/cpu-detection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ fn x86_all() {
"avx512_vpopcntdq {:?}",
is_x86_feature_detected!("avx512vpopcntdq")
);
println!("f16c: {:?}", is_x86_feature_detected!("f16c"));
println!("fma: {:?}", is_x86_feature_detected!("fma"));
println!("abm: {:?}", is_x86_feature_detected!("abm"));
println!("bmi: {:?}", is_x86_feature_detected!("bmi1"));
Expand Down
7 changes: 7 additions & 0 deletions crates/std_detect/src/detect/arch/x86.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
/// * `"avx512ifma"`
/// * `"avx512vbmi"`
/// * `"avx512vpopcntdq"`
/// * `"f16c"`
/// * `"fma"`
/// * `"bmi1"`
/// * `"bmi2"`
Expand Down Expand Up @@ -179,6 +180,10 @@ macro_rules! is_x86_feature_detected {
cfg!(target_feature = "avx512vpopcntdq") || $crate::detect::check_for(
$crate::detect::Feature::avx512_vpopcntdq)
};
("f16c") => {
cfg!(target_feature = "f16c") || $crate::detect::check_for(
$crate::detect::Feature::f16c)
};
("fma") => {
cfg!(target_feature = "fma") || $crate::detect::check_for(
$crate::detect::Feature::fma)
Expand Down Expand Up @@ -309,6 +314,8 @@ pub enum Feature {
/// AVX-512 VPOPCNTDQ (Vector Population Count Doubleword and
/// Quadword)
avx512_vpopcntdq,
/// F16C (Conversions between IEEE-754 `binary16` and `binary32` formats)
f16c,
/// FMA (Fused Multiply Add)
fma,
/// BMI1 (Bit Manipulation Instructions 1)
Expand Down
3 changes: 2 additions & 1 deletion crates/std_detect/src/detect/os/x86.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,13 +113,14 @@ fn detect_features() -> cache::Initializer {
};

enable(proc_info_ecx, 0, Feature::sse3);
enable(proc_info_ecx, 1, Feature::pclmulqdq);
enable(proc_info_ecx, 9, Feature::ssse3);
enable(proc_info_ecx, 13, Feature::cmpxchg16b);
enable(proc_info_ecx, 19, Feature::sse4_1);
enable(proc_info_ecx, 20, Feature::sse4_2);
enable(proc_info_ecx, 23, Feature::popcnt);
enable(proc_info_ecx, 25, Feature::aes);
enable(proc_info_ecx, 1, Feature::pclmulqdq);
enable(proc_info_ecx, 29, Feature::f16c);
enable(proc_info_ecx, 30, Feature::rdrand);
enable(extended_features_ebx, 18, Feature::rdseed);
enable(extended_features_ebx, 19, Feature::adx);
Expand Down
1 change: 1 addition & 0 deletions crates/std_detect/tests/cpu-detection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ fn x86_all() {
"avx512_vpopcntdq {:?}",
is_x86_feature_detected!("avx512vpopcntdq")
);
println!("f16c: {:?}", is_x86_feature_detected!("f16c"));
println!("fma: {:?}", is_x86_feature_detected!("fma"));
println!("bmi1: {:?}", is_x86_feature_detected!("bmi1"));
println!("bmi2: {:?}", is_x86_feature_detected!("bmi2"));
Expand Down
10 changes: 7 additions & 3 deletions crates/stdsimd-verify/tests/x86-intel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -293,11 +293,15 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> {
.flat_map(|c| c.to_lowercase())
.collect::<String>();

// The XML file names IFMA as "avx512ifma52", while Rust calls
// it "avx512ifma". Fix this mismatch by replacing the Intel
// name with the Rust name.
// Fix mismatching feature names:
let fixup_cpuid = |cpuid: String| match cpuid.as_ref() {
// The XML file names IFMA as "avx512ifma52", while Rust calls
// it "avx512ifma".
"avx512ifma52" => String::from("avx512ifma"),
// See: https://github.com/rust-lang-nursery/stdsimd/issues/738
// The intrinsics guide calls `f16c` `fp16c` in disagreement with
// Intel's architecture manuals.
"fp16c" => String::from("f16c"),
_ => cpuid,
};
let fixed_cpuid = fixup_cpuid(cpuid);
Expand Down