rust-lang · gnzlbg · May 9, 2019 · Apr 23, 2019 · Apr 23, 2019 · Apr 23, 2019
diff --git a/ci/docker/x86_64-unknown-linux-gnu-emulated/Dockerfile b/ci/docker/x86_64-unknown-linux-gnu-emulated/Dockerfile
@@ -8,6 +8,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
   wget \
   bzip2
 
-RUN wget https://github.com/gnzlbg/intel_sde/raw/master/sde-external-8.16.0-2018-01-30-lin.tar.bz2
-RUN tar -xjf sde-external-8.16.0-2018-01-30-lin.tar.bz2
-ENV CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER="/sde-external-8.16.0-2018-01-30-lin/sde64 --"
+RUN wget https://github.com/gnzlbg/intel_sde/raw/master/sde-external-8.35.0-2019-03-11-lin.tar.bz2
+RUN tar -xjf sde-external-8.35.0-2019-03-11-lin.tar.bz2
+ENV CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER="/sde-external-8.35.0-2019-03-11-lin/sde64 -rtm_mode full --"
diff --git a/crates/core_arch/src/lib.rs b/crates/core_arch/src/lib.rs
@@ -32,6 +32,7 @@
     abi_unadjusted,
     adx_target_feature,
     rtm_target_feature,
+    f16c_target_feature,
     external_doc
 )]
 #![cfg_attr(test, feature(test, abi_vectorcall, untagged_unions))]
@@ -75,7 +76,4 @@ mod core_arch;
 pub use self::core_arch::arch::*;
 
 #[allow(unused_imports)]
-use core::{ffi, intrinsics, marker, mem, ptr, sync};
-
-#[cfg(test)]
-use core::hint;
+use core::{ffi, hint, intrinsics, marker, mem, ptr, sync};
diff --git a/crates/core_arch/src/simd.rs b/crates/core_arch/src/simd.rs
@@ -184,6 +184,10 @@ simd_ty!(i32x8[i32]:
          | x0, x1, x2, x3, x4, x5, x6, x7);
 simd_ty!(i64x4[i64]: i64, i64, i64, i64 | x0, x1, x2, x3);
 
+simd_ty!(f32x8[f32]:
+         f32, f32, f32, f32, f32, f32, f32, f32 |
+         x0, x1, x2, x3, x4, x5, x6, x7);
+
 // 512-bit wide types:
 
 simd_ty!(i32x16[i32]:

diff --git a/crates/core_arch/src/x86/f16c.rs b/crates/core_arch/src/x86/f16c.rs
@@ -0,0 +1,134 @@
+//! [F16C intrinsics].
+//!
+//! [F16C intrinsics]: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=fp16&expand=1769
+
+use crate::{
+    core_arch::{simd::*, x86::*},
+    hint::unreachable_unchecked,
+    mem::transmute,
+};
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+#[allow(improper_ctypes)]
+extern "unadjusted" {
+    #[link_name = "llvm.x86.vcvtph2ps.128"]
+    fn llvm_vcvtph2ps_128(a: i16x8) -> f32x4;
+    #[link_name = "llvm.x86.vcvtph2ps.256"]
+    fn llvm_vcvtph2ps_256(a: i16x8) -> f32x8;
+    #[link_name = "llvm.x86.vcvtps2ph.128"]
+    fn llvm_vcvtps2ph_128(a: f32x4, rounding: i32) -> i16x8;
+    #[link_name = "llvm.x86.vcvtps2ph.256"]
+    fn llvm_vcvtps2ph_256(a: f32x8, rounding: i32) -> i16x8;
+}
+
+/// Converts the 4 x 16-bit half-precision float values in the lowest 64-bit of
+/// the 128-bit vector `a` into 4 x 32-bit float values stored in a 128-bit wide
+/// vector.
+#[inline]
+#[target_feature(enable = "f16c")]
+#[cfg_attr(test, assert_instr("vcvtph2ps"))]
+pub unsafe fn _mm_cvtph_ps(a: __m128i) -> __m128 {
+    transmute(llvm_vcvtph2ps_128(transmute(a)))
+}
+
+/// Converts the 8 x 16-bit half-precision float values in the 128-bit vector
+/// `a` into 8 x 32-bit float values stored in a 256-bit wide vector.
+#[inline]
+#[target_feature(enable = "f16c")]
+#[cfg_attr(test, assert_instr("vcvtph2ps"))]
+pub unsafe fn _mm256_cvtph_ps(a: __m128i) -> __m256 {
+    transmute(llvm_vcvtph2ps_256(transmute(a)))
+}
+
+macro_rules! dispatch_rounding {
+    ($rounding:ident, $call:ident) => {{
+        match $rounding {
+            0 => call!(0),
+            1 => call!(1),
+            2 => call!(2),
+            3 => call!(3),
+            4 => call!(4),
+            5 => call!(5),
+            6 => call!(6),
+            7 => call!(7),
+            _ => unreachable_unchecked(),
+        }
+    }};
+}
+
+/// Converts the 4 x 32-bit float values in the 128-bit vector `a` into 4 x
+/// 16-bit half-precision float values stored in the lowest 64-bit of a 128-bit
+/// vector.
+///
+/// Rounding is done according to the `imm_rounding` parameter, which can be one of:
+///
+/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest and suppress exceptions,
+/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down and suppress exceptions,
+/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up and suppress exceptions,
+/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate and suppress exceptions,
+/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`].
+#[inline]
+#[target_feature(enable = "f16c")]
+#[rustc_args_required_const(1)]
+#[cfg_attr(test, assert_instr("vcvtps2ph", imm_rounding = 0))]
+pub unsafe fn _mm_cvtps_ph(a: __m128, imm_rounding: i32) -> __m128i {
+    let a = transmute(a);
+    macro_rules! call {
+        ($rounding:expr) => {
+            llvm_vcvtps2ph_128(a, $rounding)
+        };
+    }
+    transmute(dispatch_rounding!(imm_rounding, call))
+}
+
+/// Converts the 8 x 32-bit float values in the 256-bit vector `a` into 8 x
+/// 16-bit half-precision float values stored in a 128-bit wide vector.
+///
+/// Rounding is done according to the `imm_rounding` parameter, which can be one of:
+///
+/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest and suppress exceptions,
+/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down and suppress exceptions,
+/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up and suppress exceptions,
+/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate and suppress exceptions,
+/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`].
+#[inline]
+#[target_feature(enable = "f16c")]
+#[rustc_args_required_const(1)]
+#[cfg_attr(test, assert_instr("vcvtps2ph", imm_rounding = 0))]
+pub unsafe fn _mm256_cvtps_ph(a: __m256, imm_rounding: i32) -> __m128i {
+    let a = transmute(a);
+    macro_rules! call {
+        ($rounding:expr) => {
+            llvm_vcvtps2ph_256(a, $rounding)
+        };
+    }
+    transmute(dispatch_rounding!(imm_rounding, call))
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::{core_arch::x86::*, mem::transmute};
+    use stdsimd_test::simd_test;
+
+    #[simd_test(enable = "f16c")]
+    unsafe fn test_mm_cvtph_ps() {
+        let array = [1_f32, 2_f32, 3_f32, 4_f32];
+        let float_vec: __m128 = transmute(array);
+        let halfs: __m128i = _mm_cvtps_ph(float_vec, 0);
+        let floats: __m128 = _mm_cvtph_ps(halfs);
+        let result: [f32; 4] = transmute(floats);
+        assert_eq!(result, array);
+    }
+
+    #[simd_test(enable = "f16c")]
+    unsafe fn test_mm256_cvtph_ps() {
+        let array = [1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32];
+        let float_vec: __m256 = transmute(array);
+        let halfs: __m128i = _mm256_cvtps_ph(float_vec, 0);
+        let floats: __m256 = _mm256_cvtph_ps(halfs);
+        let result: [f32; 8] = transmute(floats);
+        assert_eq!(result, array);
+    }
+}
diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs
@@ -568,3 +568,6 @@ pub use self::bt::*;
 
 mod rtm;
 pub use self::rtm::*;
+
+mod f16c;
+pub use self::f16c::*;
diff --git a/crates/core_arch/src/x86/rtm.rs b/crates/core_arch/src/x86/rtm.rs
@@ -32,6 +32,7 @@ pub const _XBEGIN_STARTED: u32 = !0;
 
 /// Transaction explicitly aborted with xabort. The parameter passed to xabort is available with
 /// `_xabort_code(status)`.
+#[allow(clippy::identity_op)]
 pub const _XABORT_EXPLICIT: u32 = 1 << 0;
 
 /// Transaction retry is possible.

diff --git a/crates/core_arch/tests/cpu-detection.rs b/crates/core_arch/tests/cpu-detection.rs
@@ -31,6 +31,7 @@ fn x86_all() {
         "avx512_vpopcntdq {:?}",
         is_x86_feature_detected!("avx512vpopcntdq")
     );
+    println!("f16c: {:?}", is_x86_feature_detected!("f16c"));
     println!("fma: {:?}", is_x86_feature_detected!("fma"));
     println!("abm: {:?}", is_x86_feature_detected!("abm"));
     println!("bmi: {:?}", is_x86_feature_detected!("bmi1"));

diff --git a/crates/std_detect/src/detect/arch/x86.rs b/crates/std_detect/src/detect/arch/x86.rs
@@ -62,6 +62,7 @@
 /// * `"avx512ifma"`
 /// * `"avx512vbmi"`
 /// * `"avx512vpopcntdq"`
+/// * `"f16c"`
 /// * `"fma"`
 /// * `"bmi1"`
 /// * `"bmi2"`
@@ -179,6 +180,10 @@ macro_rules! is_x86_feature_detected {
         cfg!(target_feature = "avx512vpopcntdq") || $crate::detect::check_for(
             $crate::detect::Feature::avx512_vpopcntdq)
     };
+    ("f16c") => {
+        cfg!(target_feature = "f16c") || $crate::detect::check_for(
+            $crate::detect::Feature::f16c)
+    };
     ("fma") => {
         cfg!(target_feature = "fma") || $crate::detect::check_for(
             $crate::detect::Feature::fma)
@@ -309,6 +314,8 @@ pub enum Feature {
     /// AVX-512 VPOPCNTDQ (Vector Population Count Doubleword and
     /// Quadword)
     avx512_vpopcntdq,
+    /// F16C (Conversions between IEEE-754 `binary16` and `binary32` formats)
+    f16c,
     /// FMA (Fused Multiply Add)
     fma,
     /// BMI1 (Bit Manipulation Instructions 1)

diff --git a/crates/std_detect/src/detect/os/x86.rs b/crates/std_detect/src/detect/os/x86.rs
@@ -113,13 +113,14 @@ fn detect_features() -> cache::Initializer {
         };
 
         enable(proc_info_ecx, 0, Feature::sse3);
+        enable(proc_info_ecx, 1, Feature::pclmulqdq);
         enable(proc_info_ecx, 9, Feature::ssse3);
         enable(proc_info_ecx, 13, Feature::cmpxchg16b);
         enable(proc_info_ecx, 19, Feature::sse4_1);
         enable(proc_info_ecx, 20, Feature::sse4_2);
         enable(proc_info_ecx, 23, Feature::popcnt);
         enable(proc_info_ecx, 25, Feature::aes);
-        enable(proc_info_ecx, 1, Feature::pclmulqdq);
+        enable(proc_info_ecx, 29, Feature::f16c);
         enable(proc_info_ecx, 30, Feature::rdrand);
         enable(extended_features_ebx, 18, Feature::rdseed);
         enable(extended_features_ebx, 19, Feature::adx);

diff --git a/crates/std_detect/tests/cpu-detection.rs b/crates/std_detect/tests/cpu-detection.rs
@@ -87,6 +87,7 @@ fn x86_all() {
         "avx512_vpopcntdq {:?}",
         is_x86_feature_detected!("avx512vpopcntdq")
     );
+    println!("f16c: {:?}", is_x86_feature_detected!("f16c"));
     println!("fma: {:?}", is_x86_feature_detected!("fma"));
     println!("bmi1: {:?}", is_x86_feature_detected!("bmi1"));
     println!("bmi2: {:?}", is_x86_feature_detected!("bmi2"));

diff --git a/crates/stdsimd-verify/tests/x86-intel.rs b/crates/stdsimd-verify/tests/x86-intel.rs
@@ -293,11 +293,15 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> {
             .flat_map(|c| c.to_lowercase())
             .collect::<String>();
 
-        // The XML file names IFMA as "avx512ifma52", while Rust calls
-        // it "avx512ifma".  Fix this mismatch by replacing the Intel
-        // name with the Rust name.
+        // Fix mismatching feature names:
         let fixup_cpuid = |cpuid: String| match cpuid.as_ref() {
+            // The XML file names IFMA as "avx512ifma52", while Rust calls
+            // it "avx512ifma".
             "avx512ifma52" => String::from("avx512ifma"),
+            // See: https://github.com/rust-lang-nursery/stdsimd/issues/738
+            // The intrinsics guide calls `f16c` `fp16c` in disagreement with
+            // Intel's architecture manuals.
+            "fp16c" => String::from("f16c"),
             _ => cpuid,
         };
         let fixed_cpuid = fixup_cpuid(cpuid);