Commit

Shyrma fix (deeplearning4j#6930)
* provide intrinsic gnu gcc function for casting float -> float16

* f16c support for avx2 builds

* meh

* another var name

* check whether the correct functions are called for float <--> float16 casts (see the intrinsics sketch below)
Yurii Shyrma authored and printomi committed Jan 7, 2019
1 parent 38ac914 commit 2382615
Showing 4 changed files with 23 additions and 6 deletions.
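
For context, the hardware path this commit wires up comes down to two scalar F16C intrinsics from <immintrin.h>. A minimal standalone sketch, not part of the commit (file and helper names are illustrative):

// f16c_sketch.cpp -- build with: g++ -mf16c f16c_sketch.cpp
// Mirrors what float16.h now does behind
// #if defined(__INTEL_COMPILER) || defined(__F16C__).
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

// half -> float: scalar form of VCVTPH2PS
static float half_to_float(uint16_t h) {
    return _cvtsh_ss(h);
}

// float -> half: scalar form of VCVTPS2PH; imm8 = 0 selects round-to-nearest
static uint16_t float_to_half(float f) {
    return _cvtss_sh(f, 0);
}

int main() {
    uint16_t h = float_to_half(1.5f);   // 1.5 is exactly representable in half
    printf("1.5f -> 0x%04x -> %f\n", h, half_to_float(h));
    return 0;
}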
2 changes: 1 addition & 1 deletion libnd4j/blas/CMakeLists.txt
@@ -51,7 +51,7 @@ IF(${ARCH} MATCHES "arm*")
 ELSEIF(${ARCH} MATCHES "power*")
     set(ARCH_TUNE "-mcpu=${ARCH} -mtune=${ARCH} -D__POWER")
 ELSEIF(${EXTENSION} MATCHES "avx2")
-    set(ARCH_TUNE "-march=${ARCH} -mtune=${ARCH} -msse4.1 -msse4.2 -mavx -mavx2 -mfma")
+    set(ARCH_TUNE "-march=${ARCH} -mtune=${ARCH} -msse4.1 -msse4.2 -mavx -mavx2 -mfma -mf16c -D__F16C__=true")
 ELSE()
     if (${ARCH} STREQUAL "x86-64")
         set(ARCH_TYPE "generic")
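A side note on the flags above: on GNU GCC and Clang, -mf16c already predefines the __F16C__ macro, so the explicit -D__F16C__=true is belt-and-braces there (the guards in float16.h below only test defined(__F16C__), never its value). A throwaway probe to confirm the flag reached the compiler, with a hypothetical file name:

// check_f16c.cpp -- refuses to compile unless F16C is enabled
// g++ -mf16c check_f16c.cpp
#if !defined(__F16C__)
#error "F16C not enabled; build with -mf16c"
#endif
int main() { return 0; }
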
8 changes: 6 additions & 2 deletions libnd4j/include/types/float16.h
@@ -27,6 +27,9 @@
 #include <iosfwd>
 #include <iostream>
 #include <pointercast.h>
+#if defined(__INTEL_COMPILER) || defined(__F16C__)
+#include <immintrin.h>
+#endif

 // support for half precision conversion
 #ifdef __INTEL_COMPILER
@@ -116,7 +119,7 @@ typedef __half ihalf;
 #include <fp16_emu.h>


-#ifdef __INTEL_COMPILER
+#if defined(__INTEL_COMPILER) || defined(__F16C__)
 //_Pragma("omp declare simd") inline
 local_def float cpu_ihalf2float(ihalf h) {
     return _cvtsh_ss(h.getX());
@@ -151,11 +154,12 @@ local_def float cpu_ihalf2float(ihalf h) {
 }
 #endif

-#ifdef __INTEL_COMPILER
+#if defined(__INTEL_COMPILER) || defined(__F16C__)
 //_Pragma("omp declare simd") inline
 local_def ihalf cpu_float2ihalf_rn(float f) {
     ihalf ret;
     ret.x = _cvtss_sh(f, 0);
+
     return ret;
 }

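The second argument to _cvtss_sh above is the rounding control; 0 selects round-to-nearest, which is what the _rn suffix of cpu_float2ihalf_rn promises. A standalone round-trip sanity check under the same assumptions (sketch, not part of the commit):

// roundtrip_sketch.cpp -- g++ -mf16c roundtrip_sketch.cpp
#include <immintrin.h>
#include <cassert>
#include <cstdint>

int main() {
    // values exactly representable in IEEE 754 half precision round-trip losslessly
    const float exact[] = {0.0f, 0.5f, 1.0f, 2.0f, -3.25f, 65504.0f};
    for (float f : exact) {
        uint16_t h = _cvtss_sh(f, 0);   // float -> half, round-to-nearest
        assert(_cvtsh_ss(h) == f);      // half -> float recovers the value exactly
    }
    // 0.1f is not representable in half, so the round trip loses precision
    assert(_cvtsh_ss(_cvtss_sh(0.1f, 0)) != 0.1f);
    return 0;
}
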
15 changes: 12 additions & 3 deletions libnd4j/tests_cpu/layers_tests/DataTypesValidationTests.cpp
@@ -55,9 +55,9 @@ TEST_F(DataTypesValidationTests, Basic_Test_1) {
 }

 TEST_F(DataTypesValidationTests, Basic_Test_2) {
-    auto input = NDArrayFactory::create<int8_t>('c', {1, 1, 1, 4});
-    auto weights = NDArrayFactory::create<float16>('c', {1, 1, 1, 4});
-    auto exp = NDArrayFactory::create<float16>('c', {1, 4, 1, 4}, {2., 4., 6., 8., 2., 4., 6., 8., 2., 4., 6., 8., 2., 4., 6., 8.});
+    auto input = NDArrayFactory::create<uint8_t>('c', {1, 1, 1, 4});
+    auto weights = NDArrayFactory::create<float>('c', {1, 1, 1, 4});
+    auto exp = NDArrayFactory::create<float>('c', {1, 4, 1, 4}, {2., 4., 6., 8., 2., 4., 6., 8., 2., 4., 6., 8., 2., 4., 6., 8.});

     weights.assign(2.0);
     input.linspace(1);
@@ -103,3 +103,12 @@ TEST_F(DataTypesValidationTests, Basic_Test_4) {
     auto result = op.execute({&input, &weights}, {&out}, {}, {1, 1, 1, 1, 0, 0, 1, 1, 0, 0}, {});
     ASSERT_EQ(ND4J_STATUS_VALIDATION, result);
 }
+
+TEST_F(DataTypesValidationTests, cast_1) {
+
+    float16 x = static_cast<float16>(1.f);
+    float y = static_cast<float16>(x);
+
+    ASSERT_TRUE(1.f == x);
+    ASSERT_TRUE(y == x);
+}
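
A note on how cast_1 exercises both directions: static_cast<float16>(1.f) runs the float -> half constructor, and although static_cast<float16>(x) on the second line is a no-op, binding its result to a float invokes float16's implicit conversion operator, i.e. the half -> float path. The same checks with the conversions spelled out might look like this (hypothetical test name; assumes only the converting constructor and conversion to float already used by the test above):

TEST_F(DataTypesValidationTests, cast_1_explicit) {
    float16 x(1.f);                     // float -> half (cpu_float2ihalf_rn on CPU)
    float   y = static_cast<float>(x);  // half -> float (cpu_ihalf2float on CPU)
    ASSERT_TRUE(x == 1.f);
    ASSERT_TRUE(y == 1.f);
}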
4 changes: 4 additions & 0 deletions libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt
@@ -44,6 +44,10 @@ else()
     else()
         set(CMAKE_CXX_FLAGS " -g -O0 -fPIC -std=c++11 -fassociative-math -funsafe-math-optimizations -fsanitize=address")
     endif()
+
+    if (${F16C})
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mf16c -D__F16C__=true")
+    endif()
 endif()

 if ("${EXPERIMENTAL}" STREQUAL "yes")
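Because F16C is a build-time switch here, a test binary built with -mf16c will execute VCVTPS2PH/VCVTPH2PS unconditionally and die with SIGILL on a CPU without the extension, so whoever configures the build is responsible for passing F16C only on supporting hardware. If runtime detection were ever wanted, the feature bit is CPUID leaf 1, ECX bit 29; a detection sketch for GCC/Clang on x86 (not part of the commit):

// f16c_detect.cpp -- runtime check for the F16C extension
#include <cpuid.h>
#include <cstdio>

static bool cpu_has_f16c() {
    unsigned int eax, ebx, ecx, edx;
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        return false;                 // CPUID leaf 1 unsupported
    return (ecx & bit_F16C) != 0;     // CPUID.01H:ECX, bit 29
}

int main() {
    printf("F16C supported: %s\n", cpu_has_f16c() ? "yes" : "no");
    return 0;
}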
