Skip to content
Permalink
Browse files

Merge branch 'release_1_4'

  • Loading branch information...
Michal Babej
Michal Babej committed Sep 24, 2019
2 parents 3d5f25e + e8af981 commit d893992b57f0c900cb99cab2cef2e8053c7f58c4
@@ -14,19 +14,10 @@ matrix:
compiler: gcc
env: LLVM_VERSION=6.0 HWLOC_VERSION=1.11 DOCKERFILE=Ubuntu/16_04.64bit

- os: linux
compiler: gcc
env: LLVM_VERSION=6.0 HWLOC_VERSION=2.0 CONDA=True

- os: osx
compiler: clang
env: LLVM_VERSION=6.0 HWLOC_VERSION=2.0 CONDA=True

- os: osx
compiler: gcc
env: LLVM_VERSION=4.0 HWLOC_VERSION=1.11 CONDA=True


before_install:
- if [ "$TRAVIS_OS_NAME" = "osx" ] ; then
export MINICONDA_FILE="Miniconda3-latest-MacOSX-x86_64.sh";
@@ -509,6 +509,10 @@ if(UNIX)
"unistd.h"
HAVE_SLEEP)

# Probe for getrlimit(); the result is stored in HAVE_GETRLIMIT, which is
# exported through the config header template (#cmakedefine HAVE_GETRLIMIT)
# and guards the RLIMIT_DATA-based memory limit in the runtime sources.
CHECK_SYMBOL_EXISTS("getrlimit"
"sys/time.h;sys/resource.h"
HAVE_GETRLIMIT)

CHECK_SYMBOL_EXISTS("utime"
"sys/types.h;utime.h"
HAVE_UTIME)
@@ -569,6 +573,7 @@ else()
set(HAVE_MKDTEMP 0)
set(HAVE_FUTIMENS 0)
set(HAVE_FORK 0)
set(HAVE_GETRLIMIT 0)
set(HAVE_VFORK 0)
set(HAVE_UTIME 0)
set(HAVE_DLADDR 0)
@@ -227,7 +227,7 @@ set(CLANG_LIBNAMES clangCodeGen clangFrontendTool clangFrontend clangDriver clan
foreach(LIBNAME ${CLANG_LIBNAMES})
find_library(C_LIBFILE_${LIBNAME} NAMES "${LIBNAME}" HINTS "${LLVM_LIBDIR}")
list(APPEND CLANG_LIBFILES "${C_LIBFILE_${LIBNAME}}")
if(CMAKE_SYSTEM_NAME MATCHES "Linux")
if(UNIX AND (NOT APPLE))
set(LLVM_LDFLAGS "${LLVM_LDFLAGS} -Wl,--exclude-libs,lib${LIBNAME}")
endif()
endforeach()
@@ -341,6 +341,11 @@ macro(custom_try_compile_clang SOURCE1 SOURCE2 RES_VAR)
custom_try_compile_c_cxx("${CLANG}" "c" "${SOURCE1}" "${SOURCE2}" ${RES_VAR} "-c" ${ARGN})
endmacro()

# clang try-compile macro, silent variant.
# Forwards SOURCE1, SOURCE2 and RES_VAR to custom_try_compile_c_cxx_silent,
# passing ${CLANG} as the compiler, "c" as the second argument (presumably the
# source language/suffix -- confirm against custom_try_compile_c_cxx_silent),
# and "-c" followed by any extra arguments (${ARGN}).
macro(custom_try_compile_clang_silent SOURCE1 SOURCE2 RES_VAR)
custom_try_compile_c_cxx_silent("${CLANG}" "c" "${SOURCE1}" "${SOURCE2}" ${RES_VAR} "-c" ${ARGN})
endmacro()

# clang++ try-link macro
macro(custom_try_link_clang SOURCE1 SOURCE2 RES_VAR)
set(RANDOM_FILENAME "${CMAKE_BINARY_DIR}/compile_test_${RNDNAME}.${SUFFIX}")
@@ -489,15 +494,6 @@ if(NOT DEFINED CLANG_NEEDS_RTLIB)

endif()

####################################################################
#X86 has -march and -mcpu reversed, for clang

if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(powerpc|arm|aarch64)")
set(CLANG_MARCH_FLAG "-mcpu=")
else()
set(CLANG_MARCH_FLAG "-march=")
endif()

####################################################################

macro(CHECK_ALIGNOF TYPE TYPEDEF OUT_VAR)
@@ -702,6 +698,26 @@ else()
endif()


####################################################################
# Some architectures have -march and -mcpu reversed

# Detect which option clang uses to select the target CPU: try "-march=" first,
# then "-mcpu=", and cache whichever one compiles.
#
# if(DEFINED ...) takes a variable *name*. Writing ${CLANG_MARCH_FLAG} here
# would expand to the variable's value (empty, "-march=" or "-mcpu="), which is
# never the name of a defined variable, so the probe would be repeated on every
# configure run and a cached or user-supplied value would be ignored.
if(NOT DEFINED CLANG_MARCH_FLAG)
  message(STATUS "Checking clang -march vs. -mcpu flag")
  # This block treats a false RES as "compiled successfully"
  # (presumably RES carries the compiler's exit status -- confirm against
  # custom_try_compile_c_cxx_silent).
  custom_try_compile_clang_silent("" "return 0;" RES ${CLANG_TARGET_OPTION}${LLC_TRIPLE} -march=${LLC_HOST_CPU})
  if(NOT RES)
    set(CLANG_MARCH_FLAG "-march=")
  else()
    custom_try_compile_clang_silent("" "return 0;" RES ${CLANG_TARGET_OPTION}${LLC_TRIPLE} -mcpu=${LLC_HOST_CPU})
    if(NOT RES)
      set(CLANG_MARCH_FLAG "-mcpu=")
    else()
      message(FATAL_ERROR "Could not determine whether to use -march or -mcpu with clang")
    endif()
  endif()

  # Persist the result so later configure runs skip the probe.
  set(CLANG_MARCH_FLAG "${CLANG_MARCH_FLAG}" CACHE INTERNAL "Clang option used to specify the target cpu")
endif()

####################################################################

# This tests that we can actually link to the llvm libraries.
@@ -760,7 +776,7 @@ endif()
if(ENABLE_HOST_CPU_DEVICES AND NOT DEFINED ${CL_DISABLE_HALF})
set(CL_DISABLE_HALF 0)
message(STATUS "Checking fp16 support")
custom_try_compile_c_cxx_silent("${CLANG}" "c" "__fp16 callfp16(__fp16 a) { return a * (__fp16)1.8; };" "__fp16 x=callfp16((__fp16)argc);" RESV -c ${CLANG_TARGET_OPTION}${LLC_TRIPLE} ${CLANG_MARCH_FLAG}${LLC_HOST_CPU})
custom_try_compile_clang_silent("__fp16 callfp16(__fp16 a) { return a * (__fp16)1.8; };" "__fp16 x=callfp16((__fp16)argc);" RESV ${CLANG_TARGET_OPTION}${LLC_TRIPLE} ${CLANG_MARCH_FLAG}${LLC_HOST_CPU})
if(RESV)
set(CL_DISABLE_HALF 1)
endif()
@@ -45,6 +45,8 @@

#cmakedefine HAVE_FSYNC

#cmakedefine HAVE_GETRLIMIT

#cmakedefine HAVE_MKOSTEMPS

#cmakedefine HAVE_MKSTEMPS
@@ -35,12 +35,8 @@
#include <unistd.h>
#include <utlist.h>

#ifndef _MSC_VER
# include <sys/time.h>
# include <sys/resource.h>
# include <unistd.h>
#else
# include "vccompat.hpp"
#ifdef _MSC_VER
#include "vccompat.hpp"
#endif

#include "common.h"
@@ -57,6 +53,12 @@
#include "pocl_runtime_config.h"
#include "pocl_util.h"

#ifdef HAVE_GETRLIMIT
#include <sys/time.h>
#include <sys/resource.h>
#include <unistd.h>
#endif

#ifdef HAVE_LIBDL
#if defined(__APPLE__)
#define _DARWIN_C_SOURCE
@@ -1123,8 +1125,6 @@ static pocl_global_mem_t system_memory = {POCL_LOCK_INITIALIZER, 0, 0, 0};
void
pocl_setup_device_for_system_memory (cl_device_id device)
{
int limit_memory_gb = pocl_get_int_option ("POCL_MEMORY_LIMIT", 0);

/* set up system memory limits, if required */
if (system_memory.total_alloc_limit == 0)
{
@@ -1133,31 +1133,46 @@ pocl_setup_device_for_system_memory (cl_device_id device)
* this sets it to 3/4 for systems with <=7gig mem,
* for >7 it sets to (total-2gigs)
*/
size_t alloc_limit = device->global_mem_size;
if ((alloc_limit >> 20) > (7 << 10))
system_memory.total_alloc_limit = alloc_limit - (size_t)(1UL << 31);
cl_ulong alloc_limit = device->global_mem_size;
if (alloc_limit > ((cl_ulong)7 << 30))
system_memory.total_alloc_limit = alloc_limit - ((cl_ulong)2 << 30);
else
{
size_t temp = (alloc_limit >> 2);
cl_ulong temp = (alloc_limit >> 2);
system_memory.total_alloc_limit = alloc_limit - temp;
}

system_memory.max_ever_allocated =
system_memory.currently_allocated = 0;

/* in some cases (e.g. ARM32 pocl on ARM64 system with >4G ram),
* global memory is correctly reported but larger than can be
* used; limit to pointer size */
if (system_memory.total_alloc_limit > UINTPTR_MAX)
system_memory.total_alloc_limit = UINTPTR_MAX;

/* apply rlimit settings */
#ifdef HAVE_GETRLIMIT
struct rlimit limits;
int ret = getrlimit (RLIMIT_DATA, &limits);
if ((ret == 0) && (system_memory.total_alloc_limit > limits.rlim_cur))
system_memory.total_alloc_limit = limits.rlim_cur;
#endif
}

device->global_mem_size = system_memory.total_alloc_limit;

int limit_memory_gb = pocl_get_int_option ("POCL_MEMORY_LIMIT", 0);
if (limit_memory_gb > 0)
{
size_t limited_memory = (size_t)limit_memory_gb << 30;
cl_ulong limited_memory = (cl_ulong)limit_memory_gb << 30;
if (device->global_mem_size > limited_memory)
device->global_mem_size = limited_memory;
else
POCL_MSG_WARN ("requested POCL_MEMORY_LIMIT %i GBs is larger than"
" physical memory size (%zu) GBs, ignoring\n",
" physical memory size (%u) GBs, ignoring\n",
limit_memory_gb,
(size_t) (device->global_mem_size >> 30));
(unsigned)(device->global_mem_size >> 30));
}

if (device->global_mem_size < MIN_MAX_MEM_ALLOC_SIZE)
@@ -1167,22 +1182,8 @@ pocl_setup_device_for_system_memory (cl_device_id device)
* can potentially allocate the whole memory for a single buffer, unless
* of course there are limits set at the operating system level. Of course
* we still have to respect the OpenCL-commanded minimum */
size_t alloc_limit = SIZE_MAX;

#ifndef _MSC_VER
// TODO getrlimit equivalent under Windows
struct rlimit limits;
int ret = getrlimit(RLIMIT_DATA, &limits);
if (ret == 0)
alloc_limit = limits.rlim_cur;
else
#endif
alloc_limit = MIN_MAX_MEM_ALLOC_SIZE;

if (alloc_limit > device->global_mem_size)
alloc_limit = pocl_size_ceil2 (device->global_mem_size / 4);
if (alloc_limit > (device->global_mem_size / 2))
alloc_limit >>= 1;
cl_ulong alloc_limit = pocl_size_ceil2_64 (device->global_mem_size / 4);

if (alloc_limit < MIN_MAX_MEM_ALLOC_SIZE)
alloc_limit = MIN_MAX_MEM_ALLOC_SIZE;
@@ -1214,11 +1215,11 @@ pocl_set_buffer_image_limits(cl_device_id device)
* try to allocate max size constant objects and run out of memory
* while trying to fill them. */

size_t s;
cl_ulong s;
if (device->global_mem_cache_size > 0)
s = pocl_size_ceil2 (device->global_mem_cache_size / 2);
s = pocl_size_ceil2_64 (device->global_mem_cache_size / 2);
else
s = pocl_size_ceil2 (device->global_mem_size / 256);
s = pocl_size_ceil2_64 (device->global_mem_size / 256);

device->local_mem_size = device->max_constant_buffer_size = s;

@@ -1302,12 +1303,13 @@ void
pocl_print_system_memory_stats()
{
POCL_MSG_PRINT_F (MEMORY, INFO, "",
"____ Total available system memory : %10zu KB\n"
" ____ Currently used system memory : %10zu KB\n"
" ____ Max used system memory : %10zu KB\n",
system_memory.total_alloc_limit >> 10,
system_memory.currently_allocated >> 10,
system_memory.max_ever_allocated >> 10);
"____ Total available system memory : %10" PRIu64 " KB\n"
" ____ Currently used system memory : %10" PRIu64 " KB\n"
" ____ Max used system memory : %10" PRIu64
" KB\n",
system_memory.total_alloc_limit >> 10,
system_memory.currently_allocated >> 10,
system_memory.max_ever_allocated >> 10);
}

/* Unique hash for a device + program build + kernel name combination.
@@ -625,9 +625,9 @@ struct pocl_device_ops {

typedef struct pocl_global_mem_t {
pocl_lock_t pocl_lock;
size_t max_ever_allocated;
size_t currently_allocated;
size_t total_alloc_limit;
cl_ulong max_ever_allocated;
cl_ulong currently_allocated;
cl_ulong total_alloc_limit;
} pocl_global_mem_t;

#define NUM_OPENCL_IMAGE_TYPES 6
@@ -221,6 +221,24 @@ pocl_size_ceil2(size_t x) {
return ++x;
}

uint64_t
pocl_size_ceil2_64 (uint64_t x)
{
  /* Round x up to the next power of two; a value that already is a power
   * of two is returned unchanged.
   *
   * Technique: subtract one, smear the highest set bit into every lower
   * bit position (shifts of 1, 2, 4, 8, 16 and 32 cover all 64 bits),
   * then add one back. See:
   *
   * http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
   *
   * Because the arithmetic is unsigned and wraps, an input of 0 and any
   * input above 2^63 both yield 0.
   */
  uint64_t smeared = x - 1;
  unsigned shift;

  for (shift = 1; shift < 64; shift <<= 1)
    smeared |= smeared >> shift;

  return smeared + 1;
}

static void*
pocl_memalign_alloc(size_t align_width, size_t size)
{
@@ -69,7 +69,8 @@ unsigned pocl_save_ftz ();
void pocl_restore_ftz (unsigned ftz);

/* Finds the next highest power of two of the given value. */
size_t pocl_size_ceil2(size_t x);
size_t pocl_size_ceil2 (size_t x);
uint64_t pocl_size_ceil2_64 (uint64_t x);

/* Allocates aligned blocks of memory.
*
@@ -179,4 +179,9 @@ kernel void test_printf()
printf("|%s|%4s|%-4s|%4s|%.4s|\n", "aa", "bb", "cc", "dddddddddd", "eeeeee");
printf("|%p|%12p|%-12p|\n", (void*)0x2349aacc, (void*)0xdeaddeed, (void*)0x92820384);

printf ("\nPARAMETER PASSING\n\n");

printf("%c %#v2hhx %#v2hhx %c\n", '*', (char2)(0xFA, 0xFB), (char2)(0xFC, 0xFD), '.');
printf("%c %#v2hx %#v2hx %c\n", '*', (short2)(0x1234, 0x8765), (short2)(0xBEEF, 0xF00D), '.');
printf("%c %#v2hlx %#v2hlx %c\n", '*', (int2)(0x12345678, 0x87654321), (int2)(0x2468ACE0, 0xFDB97531), '.');
}
@@ -151,4 +151,10 @@ VECTORS
|a| b|c |
|aa| bb|cc |dddddddddd|eeee|
|0x2349aacc| 0xdeaddeed|0x92820384 |

PARAMETER PASSING

* 0xfa,0xfb 0xfc,0xfd .
* 0x1234,0x8765 0xbeef,0xf00d .
* 0x12345678,0x87654321 0x2468ace0,0xfdb97531 .
OK

0 comments on commit d893992

Please sign in to comment.
You can’t perform that action at this time.