diff --git a/CMakeLists.txt b/CMakeLists.txt index c3733f1280..edbc68ee39 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1325,7 +1325,8 @@ cl_exp_pinned_buffers") set(HOST_DEVICE_FEATURES_30 "__opencl_c_3d_image_writes __opencl_c_images \ __opencl_c_atomic_order_acq_rel __opencl_c_atomic_order_seq_cst \ __opencl_c_atomic_scope_device __opencl_c_program_scope_global_variables \ -__opencl_c_atomic_scope_all_devices __opencl_c_generic_address_space") +__opencl_c_atomic_scope_all_devices __opencl_c_generic_address_space \ +__opencl_c_work_group_collective_functions") # Host CPU device: extensions only enabled when conformance is OFF if(NOT ENABLE_CONFORMANCE) diff --git a/doc/sphinx/source/notes_6_0.rst b/doc/sphinx/source/notes_6_0.rst index ec0a37b8a3..27d21357ea 100644 --- a/doc/sphinx/source/notes_6_0.rst +++ b/doc/sphinx/source/notes_6_0.rst @@ -2,14 +2,6 @@ Release Notes for PoCL 6.0 ************************** - - -Minimal support for `cl_khr_priority_hints` and `cl_khr_throttle_hints` has been added. -As the extension specification states that these hints provide no guarantees of -any particular behavior (or lack thereof) they are treated as a no-op. However -specifying them no longer causes `clCreateCommandQueueWithProperties` to return -an error. - ============================ New device driver: cpu-tbb ============================ @@ -18,6 +10,16 @@ The cpu-tbb device driver uses the Intel oneAPI Threading Building Blocks (oneTB library for work-group and kernel-level task scheduling. Except for the task scheduler, the driver is identical to the original 'cpu' driver (pthread). +===================================== +Command queue priority/throttle hints +===================================== + +Minimal support for `cl_khr_priority_hints` and `cl_khr_throttle_hints` has been added. +As the extension specification states that these hints provide no guarantees of +any particular behavior (or lack thereof) they are treated as a no-op. 
However +specifying them no longer causes `clCreateCommandQueueWithProperties` to return +an error. + =========================== Driver-specific features =========================== @@ -26,9 +28,13 @@ Driver-specific features CPU driver ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The 'cpu' driver gained support for using OpenMP for thread scheduling. -Support is disabled by default, but can be enabled with CMake option. The -'cpu-minimal' driver does not support OpenMP. + * Support for using OpenMP for task scheduling was added. It is disabled + by default, but can be enabled with a CMake option. The 'cpu-minimal' + driver does not support OpenMP since it's supposed to be single-threaded. + * The CPU drivers can now be used for running SYCL programs compiled with + the oneAPI binary distributions of DPC++ by adding the following environment + settings: **POCL_DRIVER_VERSION_OVERRIDE=2023.16.7.0.21_160000 POCL_CPU_VENDOR_ID_OVERRIDE=32902**. + * Added support for the **__opencl_c_work_group_collective_functions** feature. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Remote diff --git a/doc/sphinx/source/using.rst b/doc/sphinx/source/using.rst index e54023e6b4..7e7ec6d7f8 100644 --- a/doc/sphinx/source/using.rst +++ b/doc/sphinx/source/using.rst @@ -160,6 +160,14 @@ pocl. 'cpu' device driver. The default is to determine this from the number of hardware threads available in the CPU. +- **POCL_CPU_VENDOR_ID_OVERRIDE** + + Overrides the vendor id reported by PoCL for the CPU drivers. + For example, setting the vendor id to be 32902 (0x8086) and setting the driver + version using **POCL_DRIVER_VERSION_OVERRIDE** to "2023.16.7.0.21_160000" (or such) can + be used to convince binary-distributed DPC++ compilers to compile and run SYCL + programs on the PoCL-CPU driver. + - **POCL_DEBUG** Enables debug messages to stderr. This will be mostly messages from error @@ -170,7 +178,8 @@ pocl. 
The old way (setting POCL_DEBUG to 1) has been updated to support categories. Using this limits the amount of debug messages produced. Current options are: - error,warning,general,memory,llvm,events,cache,locking,refcounts,timing,hsa,tce,cuda,vulkan,proxy,all. + 'error', 'warning', 'general', 'memory', 'llvm', 'events', 'cache', 'locking', + 'refcounts', 'timing', 'hsa', 'tce', 'cuda', 'vulkan', 'proxy' and 'all'. Note: setting POCL_DEBUG to 1 still works and equals error+warning+general. - **POCL_DEBUG_LLVM_PASSES** @@ -221,6 +230,11 @@ pocl. POCL_TTASIM0_PARAMETERS will be passed to the first ttasim driver instantiated and POCL_TTASIM1_PARAMETERS to the second one. +- **POCL_DRIVER_VERSION_OVERRIDE** + + Can be used to override the driver version reported by PoCL. + See **POCL_CPU_VENDOR_ID_OVERRIDE** for an example use case. + - **POCL_EXTRA_BUILD_FLAGS** Adds the contents of the environment variable to all clBuildProgram() calls. diff --git a/examples/boxadd/boxadd.c b/examples/boxadd/boxadd.c index a89074d611..38ca19924d 100644 --- a/examples/boxadd/boxadd.c +++ b/examples/boxadd/boxadd.c @@ -112,5 +112,9 @@ main (int argc, char **argv) CHECK_CL_ERROR (clReleaseContext (context)); CHECK_CL_ERROR (clUnloadPlatformCompiler (platform)); + free (srcA); + free (srcB); + free (dst); + return err; } diff --git a/examples/matadd/matadd.c b/examples/matadd/matadd.c index 2c54d597d1..09690ef5be 100644 --- a/examples/matadd/matadd.c +++ b/examples/matadd/matadd.c @@ -108,5 +108,9 @@ main (int argc, char **argv) CHECK_CL_ERROR (clReleaseContext (context)); CHECK_CL_ERROR (clUnloadPlatformCompiler (platform)); + free (srcA); + free (srcB); + free (dst); + return err; } diff --git a/include/pocl.h b/include/pocl.h index 1714b02fd0..ea2e2e5276 100644 --- a/include/pocl.h +++ b/include/pocl.h @@ -47,9 +47,11 @@ /* detects restrict, variadic macros etc */ #include "pocl_compiler_features.h" -/* The maximum file, directory and path name lengths. 
TODO: These should be - detected from the filesystem properties of the execution platform. */ -#define POCL_MAX_DIRNAME_LENGTH 255 +/* The maximum file, directory and path name lengths. + NOTE: GDB seems to fail to load symbols from .so files which have + longer pathnames than 511, thus the quite small dir/filename length + limiter. */ +#define POCL_MAX_DIRNAME_LENGTH 64 #define POCL_MAX_FILENAME_LENGTH (POCL_MAX_DIRNAME_LENGTH) #define POCL_MAX_PATHNAME_LENGTH 4096 diff --git a/lib/CL/clCreateProgramWithIL.c b/lib/CL/clCreateProgramWithIL.c index d6a7265026..90e3647834 100644 --- a/lib/CL/clCreateProgramWithIL.c +++ b/lib/CL/clCreateProgramWithIL.c @@ -128,16 +128,13 @@ CL_API_SUFFIX__VERSION_2_1 POCL_GOTO_ERROR_COND ((length == 0), CL_INVALID_VALUE); int is_spirv = 0; -#ifdef ENABLE_SPIRV int is_spirv_kernel = pocl_bitcode_is_spirv_execmodel_kernel ((const char *)il, length); is_spirv += is_spirv_kernel; -#endif -#ifdef ENABLE_VULKAN + int is_spirv_shader = pocl_bitcode_is_spirv_execmodel_shader ((const char *)il, length); is_spirv += is_spirv_shader; -#endif POCL_GOTO_ERROR_ON ( (!is_spirv), CL_INVALID_VALUE, diff --git a/lib/CL/clGetDeviceInfo.c b/lib/CL/clGetDeviceInfo.c index 41e456b052..c3e49aa59c 100644 --- a/lib/CL/clGetDeviceInfo.c +++ b/lib/CL/clGetDeviceInfo.c @@ -61,14 +61,14 @@ POname(clGetDeviceInfo)(cl_device_id device, case CL_DEVICE_IMAGE_SUPPORT: POCL_RETURN_GETINFO(cl_bool, device->image_support); case CL_DEVICE_TYPE: - POCL_RETURN_GETINFO(cl_device_type, device->type); + POCL_RETURN_GETINFO (cl_device_type, device->type); case CL_DEVICE_VENDOR_ID: POCL_RETURN_GETINFO(cl_uint, device->vendor_id); case CL_DEVICE_MAX_COMPUTE_UNITS: POCL_RETURN_GETINFO(cl_uint, device->max_compute_units); case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS : POCL_RETURN_GETINFO(cl_uint, device->max_work_item_dimensions); - case CL_DEVICE_MAX_WORK_GROUP_SIZE : + case CL_DEVICE_MAX_WORK_GROUP_SIZE: { size_t max_wg_size = device->max_work_group_size; POCL_RETURN_GETINFO(size_t, 
max_wg_size); @@ -342,7 +342,7 @@ POname(clGetDeviceInfo)(cl_device_id device, case CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT: POCL_RETURN_GETINFO (cl_bool, device->non_uniform_work_group_support); case CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT: - POCL_RETURN_GETINFO (cl_bool, CL_FALSE); + POCL_RETURN_GETINFO (cl_bool, device->wg_collective_func_support); case CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT: POCL_RETURN_GETINFO (cl_bool, device->generic_as_support); case CL_DEVICE_DEVICE_ENQUEUE_CAPABILITIES: diff --git a/lib/CL/devices/common.c b/lib/CL/devices/common.c index 40298250bd..66f54fed0d 100644 --- a/lib/CL/devices/common.c +++ b/lib/CL/devices/common.c @@ -1984,7 +1984,7 @@ pocl_setup_ils_with_version (cl_device_id dev) } } -static const cl_name_version OPENCL_FEATURES[] = { +static const cl_name_version OPENCL_C_FEATURES[] = { { CL_MAKE_VERSION (3, 0, 0), "__opencl_c_3d_image_writes" }, { CL_MAKE_VERSION (3, 0, 0), "__opencl_c_images" }, { CL_MAKE_VERSION (3, 0, 0), "__opencl_c_read_write_images" }, @@ -2013,15 +2013,15 @@ static const cl_name_version OPENCL_FEATURES[] = { { CL_MAKE_VERSION (3, 0, 0), "__opencl_c_ext_fp64_local_atomic_min_max" }, }; -const size_t OPENCL_FEATURES_NUM - = sizeof (OPENCL_FEATURES) / sizeof (OPENCL_FEATURES[0]); +const size_t OPENCL_C_FEATURES_NUM + = sizeof (OPENCL_C_FEATURES) / sizeof (OPENCL_C_FEATURES[0]); void pocl_setup_features_with_version (cl_device_id dev) { cl_name_version *tmp = NULL; unsigned ret = pocl_space_delim_string_to_cl_name_version_array ( - &tmp, dev->features, OPENCL_FEATURES, OPENCL_FEATURES_NUM); + &tmp, dev->features, OPENCL_C_FEATURES, OPENCL_C_FEATURES_NUM); dev->num_opencl_features_with_version = ret; dev->opencl_features_with_version = tmp; diff --git a/lib/CL/devices/common_utils.c b/lib/CL/devices/common_utils.c index 32551537b7..5e17acc5dd 100644 --- a/lib/CL/devices/common_utils.c +++ b/lib/CL/devices/common_utils.c @@ -105,9 +105,8 @@ align_ptr (char *p) #define FALLBACK_MAX_THREAD_COUNT 
8 -/* initializes CPU-specific device info struct members, that cannot / should +/* Initializes the CPU-specific device info defaults that cannot / should not be initialized in pocl_init_default_device_infos() */ - cl_int pocl_cpu_init_common (cl_device_id device) { @@ -139,6 +138,7 @@ pocl_cpu_init_common (cl_device_id device) device->features = HOST_DEVICE_FEATURES_30; device->run_program_scope_variables_pass = CL_TRUE; device->generic_as_support = CL_TRUE; + device->wg_collective_func_support = CL_TRUE; pocl_setup_opencl_c_with_version (device, CL_TRUE); pocl_setup_features_with_version (device); diff --git a/lib/CL/devices/cpuinfo.c b/lib/CL/devices/cpuinfo.c index 4919915b34..ca6026f5f9 100644 --- a/lib/CL/devices/cpuinfo.c +++ b/lib/CL/devices/cpuinfo.c @@ -305,7 +305,8 @@ pocl_cpuinfo_get_cpu_name_and_vendor(cl_device_id device) /* default vendor and vendor_id, in case it cannot be found by other means */ device->vendor = cpuvendor_default; if (device->vendor_id == 0) - device->vendor_id = CL_KHRONOS_VENDOR_ID_POCL; + device->vendor_id = pocl_get_int_option ("POCL_CPU_VENDOR_ID_OVERRIDE", + CL_KHRONOS_VENDOR_ID_POCL); /* read contents of /proc/cpuinfo */ if (access (cpuinfo, R_OK) != 0) @@ -313,7 +314,7 @@ pocl_cpuinfo_get_cpu_name_and_vendor(cl_device_id device) FILE *f = fopen (cpuinfo, "r"); char contents[MAX_CPUINFO_SIZE]; - int num_read = fread (contents, 1, MAX_CPUINFO_SIZE - 1, f); + int num_read = fread (contents, 1, MAX_CPUINFO_SIZE - 1, f); fclose(f); contents[num_read]='\0'; diff --git a/lib/CL/devices/devices.c b/lib/CL/devices/devices.c index ce227e8f14..ca7996f7f9 100644 --- a/lib/CL/devices/devices.c +++ b/lib/CL/devices/devices.c @@ -669,7 +669,9 @@ pocl_init_devices () a shared global memory. 
*/ dev->global_mem_id = dev_index; POCL_INIT_OBJECT (dev); - dev->driver_version = POCL_VERSION_FULL; + dev->driver_version = pocl_get_string_option ( + "POCL_DRIVER_VERSION_OVERRIDE", POCL_VERSION_FULL); + if (dev->version == NULL) dev->version = "OpenCL 2.0 pocl"; diff --git a/lib/CL/devices/remote/remote.c b/lib/CL/devices/remote/remote.c index 6d89cc3364..a589eb7146 100644 --- a/lib/CL/devices/remote/remote.c +++ b/lib/CL/devices/remote/remote.c @@ -686,7 +686,7 @@ setup_relevant_devices (cl_program program, cl_device_id device, remote_server_data_t *server = ((remote_device_data_t *)device->data)->server; unsigned num_relevant_devices = 0; - char program_bc_path[POCL_MAX_FILENAME_LENGTH]; + char program_bc_path[POCL_MAX_PATHNAME_LENGTH]; unsigned i, j; for (i = 0; i < program->num_devices; ++i) diff --git a/lib/CL/pocl_cache.c b/lib/CL/pocl_cache.c index ecf5070fd1..a620694a30 100644 --- a/lib/CL/pocl_cache.c +++ b/lib/CL/pocl_cache.c @@ -92,12 +92,11 @@ void pocl_cache_program_path(char* path, program_device_dir (path, program, device_i, ""); } -// required in llvm API void pocl_cache_program_bc_path(char* program_bc_path, cl_program program, unsigned device_i) { - program_device_dir(program_bc_path, program, - device_i, POCL_PROGRAM_BC_FILENAME); + program_device_dir (program_bc_path, program, + device_i, POCL_PROGRAM_BC_FILENAME); } void @@ -208,9 +207,9 @@ pocl_cache_kernel_cachedir (char *kernel_cachedir_path, cl_program program, { int bytes_written; char tempstring[POCL_MAX_PATHNAME_LENGTH]; - char file_name[POCL_MAX_DIRNAME_LENGTH + 1]; + char file_name[POCL_MAX_FILENAME_LENGTH + 1]; - pocl_hash_clipped_name (kernel_name, POCL_MAX_DIRNAME_LENGTH, &file_name[0]); + pocl_hash_clipped_name (kernel_name, POCL_MAX_FILENAME_LENGTH, &file_name[0]); bytes_written = snprintf (tempstring, POCL_MAX_PATHNAME_LENGTH, "/%s", file_name); diff --git a/lib/CL/pocl_cl.h b/lib/CL/pocl_cl.h index 8dd114868d..f0881f4501 100644 --- a/lib/CL/pocl_cl.h +++ b/lib/CL/pocl_cl.h 
@@ -853,6 +853,7 @@ struct _cl_device_id { size_t preferred_wg_size_multiple; cl_bool non_uniform_work_group_support; cl_bool generic_as_support; + cl_bool wg_collective_func_support; cl_uint preferred_vector_width_char; cl_uint preferred_vector_width_short; cl_uint preferred_vector_width_int; diff --git a/lib/kernel/host/CMakeLists.txt b/lib/kernel/host/CMakeLists.txt index 77d34b479b..27b3d3959b 100644 --- a/lib/kernel/host/CMakeLists.txt +++ b/lib/kernel/host/CMakeLists.txt @@ -139,6 +139,7 @@ vload_store_half_f16c.c vstore.cl vstore_half.cl wait_group_events.cl +work_group.c write_image.cl ################################################################### diff --git a/lib/kernel/subgroups.c b/lib/kernel/subgroups.c index be48fc59e3..a836f92ab9 100644 --- a/lib/kernel/subgroups.c +++ b/lib/kernel/subgroups.c @@ -29,37 +29,11 @@ #include -/** - * \brief Internal pseudo function which allocates space from the work-group - * thread's stack (basically local memory) for each work-item. - * - * It's expanded in WorkitemLoops.cc to an alloca(). - * - * @param element_size The size of an element to allocate (for all WIs in the - * WG). - * @param align The alignment of the start of chunk. - * @param extra_bytes extra bytes to add to the allocation, some functions need - * extra space - * @return pointer to the allocated stack space (freed at unwind). - */ -void *__pocl_work_group_alloca (size_t element_size, size_t align, - size_t extra_bytes); - -/** - * \brief Internal pseudo function which allocates space from the work-group - * thread's stack (basically local memory). - * - * It's expanded in WorkitemLoops.cc to an alloca(). - * - * @param bytes The size of data to allocate in bytes. - * @param align The alignment of the start of chunk. - * @return pointer to the allocated stack space (freed at unwind). 
- */ -void *__pocl_local_mem_alloca (size_t bytes, size_t align); - -size_t _CL_OVERLOADABLE get_local_size (unsigned int dimindx); +#include "work_group_alloca.h" size_t _CL_OVERLOADABLE get_local_id (unsigned int dimindx); +size_t _CL_OVERLOADABLE get_local_linear_id (void); +size_t _CL_OVERLOADABLE get_local_size (unsigned int dimindx); /* Magic variable that is expanded in Workgroup.cc */ extern uint _pocl_sub_group_size; @@ -89,8 +63,6 @@ get_enqueued_num_sub_groups (void) return 1; } -size_t _CL_OVERLOADABLE get_local_linear_id (void); - uint _CL_OVERLOADABLE get_sub_group_id (void) { diff --git a/lib/kernel/work_group.c b/lib/kernel/work_group.c new file mode 100644 index 0000000000..c68f013360 --- /dev/null +++ b/lib/kernel/work_group.c @@ -0,0 +1,228 @@ +/* OpenCL built-in library: work-group collective functions + + Copyright (c) 2024 Pekka Jääskeläinen / Intel Finland Oy + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to + deal in the Software without restriction, including without limitation the + rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + sell copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + IN THE SOFTWARE. 
+*/ + +#include "work_group_alloca.h" +#include <limits.h> +#include <math.h> + +size_t _CL_OVERLOADABLE get_local_id (unsigned int dimindx); +size_t _CL_OVERLOADABLE get_local_linear_id (void); +size_t _CL_OVERLOADABLE get_local_size (unsigned int dimindx); +void _CL_OVERLOADABLE + POCL_BUILTIN_PREFIX (work_group_barrier) (cl_mem_fence_flags flags); + +#define work_group_barrier POCL_BUILTIN_PREFIX (work_group_barrier) + +/* Align the stack temporary data by this multiple to facilitate easier + vectorization. */ +#define ALIGN_ELEMENT_MULTIPLE 32 + +static size_t +get_total_local_size () +{ + return get_local_size (0) * get_local_size (1) * get_local_size (2); +} + +#define WORK_GROUP_SHUFFLE_PT(PREFIX, TYPE) \ + __attribute__ ((always_inline)) static TYPE _CL_OVERLOADABLE \ + PREFIX##work_group_shuffle (TYPE val, size_t id) \ + { \ + volatile TYPE *temp_storage = __pocl_work_group_alloca ( \ + sizeof (TYPE), ALIGN_ELEMENT_MULTIPLE * sizeof (TYPE), 0); \ + temp_storage[get_local_linear_id ()] = val; \ + work_group_barrier (CLK_LOCAL_MEM_FENCE); \ + return temp_storage[id % get_total_local_size ()]; \ + } + +/* Instantiate the shuffle helper with an empty PREFIX (non-prefixed name only). 
*/ +#define WORK_GROUP_SHUFFLE_T(TYPE) WORK_GROUP_SHUFFLE_PT (, TYPE) + +WORK_GROUP_SHUFFLE_T (char) +WORK_GROUP_SHUFFLE_T (uchar) +WORK_GROUP_SHUFFLE_T (short) +WORK_GROUP_SHUFFLE_T (ushort) +WORK_GROUP_SHUFFLE_T (int) +WORK_GROUP_SHUFFLE_T (uint) +WORK_GROUP_SHUFFLE_T (long) +WORK_GROUP_SHUFFLE_T (ulong) +WORK_GROUP_SHUFFLE_T (float) +WORK_GROUP_SHUFFLE_T (double) + +#define WORK_GROUP_BROADCAST_T(TYPE) \ + __attribute__ ((always_inline)) TYPE _CL_OVERLOADABLE \ + work_group_broadcast (TYPE val, size_t x) \ + { \ + return work_group_shuffle (val, x); \ + } \ + __attribute__ ((always_inline)) TYPE _CL_OVERLOADABLE \ + work_group_broadcast (TYPE val, size_t x, size_t y) \ + { \ + return work_group_shuffle (val, y * get_local_size (0) + x); \ + } \ + __attribute__ ((always_inline)) TYPE _CL_OVERLOADABLE \ + work_group_broadcast (TYPE val, size_t x, size_t y, size_t z) \ + { \ + return work_group_shuffle (val, \ + z * get_local_size (1) * get_local_size (0) \ + + y * get_local_size (0) + x); \ + } + +WORK_GROUP_BROADCAST_T (int) +WORK_GROUP_BROADCAST_T (uint) +WORK_GROUP_BROADCAST_T (long) +WORK_GROUP_BROADCAST_T (ulong) +WORK_GROUP_BROADCAST_T (float) +WORK_GROUP_BROADCAST_T (double) + +#define WORK_GROUP_REDUCE_OT(OPNAME, OPERATION, TYPE) \ + __attribute__ ((always_inline)) \ + TYPE _CL_OVERLOADABLE work_group_reduce_##OPNAME (TYPE val) \ + { \ + volatile TYPE *temp_storage = __pocl_work_group_alloca ( \ + sizeof (TYPE), ALIGN_ELEMENT_MULTIPLE * sizeof (TYPE), 0); \ + temp_storage[get_local_linear_id ()] = val; \ + work_group_barrier (CLK_LOCAL_MEM_FENCE); \ + if (get_local_linear_id () == 0) \ + { \ + for (uint i = 1; i < get_total_local_size (); ++i) \ + { \ + TYPE a = temp_storage[0], b = temp_storage[i]; \ + temp_storage[0] = OPERATION; \ + } \ + } \ + work_group_barrier (CLK_LOCAL_MEM_FENCE); \ + return temp_storage[0]; \ + } + +#define WORK_GROUP_REDUCE_T(OPNAME, OPERATION) \ + WORK_GROUP_REDUCE_OT (OPNAME, OPERATION, int) \ + WORK_GROUP_REDUCE_OT (OPNAME, 
OPERATION, uint) \ + WORK_GROUP_REDUCE_OT (OPNAME, OPERATION, long) \ + WORK_GROUP_REDUCE_OT (OPNAME, OPERATION, ulong) \ + WORK_GROUP_REDUCE_OT (OPNAME, OPERATION, float) \ + WORK_GROUP_REDUCE_OT (OPNAME, OPERATION, double) + +WORK_GROUP_REDUCE_T (add, a + b) +WORK_GROUP_REDUCE_T (min, a > b ? b : a) +WORK_GROUP_REDUCE_T (max, a > b ? a : b) + +#define WORK_GROUP_SCAN_INCLUSIVE_OT(OPNAME, OPERATION, TYPE) \ + __attribute__ ((always_inline)) \ + TYPE _CL_OVERLOADABLE work_group_scan_inclusive_##OPNAME (TYPE val) \ + { \ + volatile TYPE *data = __pocl_work_group_alloca ( \ + sizeof (TYPE), ALIGN_ELEMENT_MULTIPLE * sizeof (TYPE), 0); \ + data[get_local_linear_id ()] = val; \ + work_group_barrier (CLK_LOCAL_MEM_FENCE); \ + if (get_local_linear_id () == 0) \ + { \ + for (uint i = 1; i < get_total_local_size (); ++i) \ + { \ + TYPE a = data[i - 1], b = data[i]; \ + data[i] = OPERATION; \ + } \ + } \ + work_group_barrier (CLK_LOCAL_MEM_FENCE); \ + return data[get_local_linear_id ()]; \ + } + +#define WORK_GROUP_SCAN_INCLUSIVE_T(OPNAME, OPERATION) \ + WORK_GROUP_SCAN_INCLUSIVE_OT (OPNAME, OPERATION, int) \ + WORK_GROUP_SCAN_INCLUSIVE_OT (OPNAME, OPERATION, uint) \ + WORK_GROUP_SCAN_INCLUSIVE_OT (OPNAME, OPERATION, long) \ + WORK_GROUP_SCAN_INCLUSIVE_OT (OPNAME, OPERATION, ulong) \ + WORK_GROUP_SCAN_INCLUSIVE_OT (OPNAME, OPERATION, float) \ + WORK_GROUP_SCAN_INCLUSIVE_OT (OPNAME, OPERATION, double) + +WORK_GROUP_SCAN_INCLUSIVE_T (add, a + b) +WORK_GROUP_SCAN_INCLUSIVE_T (min, a > b ? b : a) +WORK_GROUP_SCAN_INCLUSIVE_T (max, a > b ? 
a : b) + +#define WORK_GROUP_SCAN_EXCLUSIVE_OT(OPNAME, OPERATION, TYPE, ID) \ + __attribute__ ((always_inline)) \ + TYPE _CL_OVERLOADABLE work_group_scan_exclusive_##OPNAME (TYPE val) \ + { \ + volatile TYPE *data = __pocl_work_group_alloca ( \ + sizeof (TYPE), ALIGN_ELEMENT_MULTIPLE * sizeof (TYPE), \ + sizeof (TYPE)); \ + data[get_local_linear_id () + 1] = val; \ + data[0] = ID; \ + work_group_barrier (CLK_LOCAL_MEM_FENCE); \ + if (get_local_linear_id () == 0) \ + { \ + for (uint i = 1; i < get_total_local_size (); ++i) \ + { \ + TYPE a = data[i - 1], b = data[i]; \ + data[i] = OPERATION; \ + } \ + } \ + work_group_barrier (CLK_LOCAL_MEM_FENCE); \ + return data[get_local_linear_id ()]; \ + } + +WORK_GROUP_SCAN_EXCLUSIVE_OT (add, a + b, int, 0) +WORK_GROUP_SCAN_EXCLUSIVE_OT (add, a + b, uint, 0) +WORK_GROUP_SCAN_EXCLUSIVE_OT (add, a + b, long, 0) +WORK_GROUP_SCAN_EXCLUSIVE_OT (add, a + b, ulong, 0) +WORK_GROUP_SCAN_EXCLUSIVE_OT (add, a + b, float, 0.0f) +WORK_GROUP_SCAN_EXCLUSIVE_OT (add, a + b, double, 0.0) + +WORK_GROUP_SCAN_EXCLUSIVE_OT (min, a > b ? b : a, int, INT_MAX) +WORK_GROUP_SCAN_EXCLUSIVE_OT (min, a > b ? b : a, uint, UINT_MAX) +WORK_GROUP_SCAN_EXCLUSIVE_OT (min, a > b ? b : a, long, LONG_MAX) +WORK_GROUP_SCAN_EXCLUSIVE_OT (min, a > b ? b : a, ulong, ULONG_MAX) +WORK_GROUP_SCAN_EXCLUSIVE_OT (min, a > b ? b : a, float, +INFINITY) +WORK_GROUP_SCAN_EXCLUSIVE_OT (min, a > b ? b : a, double, +INFINITY) + +WORK_GROUP_SCAN_EXCLUSIVE_OT (max, a > b ? a : b, int, INT_MIN) +WORK_GROUP_SCAN_EXCLUSIVE_OT (max, a > b ? a : b, uint, 0) +WORK_GROUP_SCAN_EXCLUSIVE_OT (max, a > b ? a : b, long, LONG_MIN) +WORK_GROUP_SCAN_EXCLUSIVE_OT (max, a > b ? a : b, ulong, 0) +WORK_GROUP_SCAN_EXCLUSIVE_OT (max, a > b ? a : b, float, -INFINITY) +WORK_GROUP_SCAN_EXCLUSIVE_OT (max, a > b ? a : b, double, -INFINITY) + +__attribute__ ((always_inline)) int _CL_OVERLOADABLE +work_group_any (int predicate) +{ + /* The results for all of the WIs. 
*/ + int *flags = __pocl_work_group_alloca ( + sizeof (int), ALIGN_ELEMENT_MULTIPLE * sizeof (int), 0); + flags[get_local_linear_id ()] = !!predicate; + /* The final result. */ + int *result = __pocl_work_group_alloca (sizeof (int), sizeof (int), 0); + work_group_barrier (CLK_LOCAL_MEM_FENCE); + if (get_local_linear_id () == 0) + { + *result = 0; + for (uint i = 0; i < get_total_local_size (); ++i) + *result |= flags[i]; + } + work_group_barrier (CLK_LOCAL_MEM_FENCE); + return *result; +} + +__attribute__ ((always_inline)) int _CL_OVERLOADABLE +work_group_all (int predicate) +{ + return !work_group_any (!predicate); /* All non-zero <=> none is zero. */ +} diff --git a/lib/kernel/work_group_alloca.h b/lib/kernel/work_group_alloca.h new file mode 100644 index 0000000000..1cff7e5cb8 --- /dev/null +++ b/lib/kernel/work_group_alloca.h @@ -0,0 +1,50 @@ +/* OpenCL built-in library: internal work group memory allocation functionality + + Copyright (c) 2022-2023 Pekka Jääskeläinen / Intel Finland Oy + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to + deal in the Software without restriction, including without limitation the + rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + sell copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + IN THE SOFTWARE. +*/ + +/** + * \brief Internal pseudo function which allocates space from the work-group + * thread's stack (basically local memory) for each work-item. + * + * It's expanded in WorkitemLoops.cc to an alloca(). + * + * @param element_size The size of an element to allocate (for all WIs in the + * WG). + * @param align The alignment of the start of chunk. + * @param extra_bytes extra bytes to add to the allocation, some functions need + * extra space + * @return pointer to the allocated stack space (freed at unwind). + */ +void *__pocl_work_group_alloca (size_t element_size, size_t align, + size_t extra_bytes); + +/** + * \brief Internal pseudo function which allocates space from the work-group + * thread's stack (basically local memory). + * + * It's expanded in WorkitemLoops.cc to an alloca(). + * + * @param bytes The size of data to allocate in bytes. + * @param align The alignment of the start of chunk. + * @return pointer to the allocated stack space (freed at unwind). + */ +void *__pocl_local_mem_alloca (size_t bytes, size_t align); diff --git a/pocld/shared_cl_context.cc b/pocld/shared_cl_context.cc index 2e459fdf79..f5f7994cce 100644 --- a/pocld/shared_cl_context.cc +++ b/pocld/shared_cl_context.cc @@ -1260,9 +1260,12 @@ bool createSPIRVWithSVMOffset(const std::vector *InputSPV, LibPoCLPath /= std::filesystem::path(BUILDDIR) / "lib" / "CL" / "libpocl.so"; + // Without -strip-debug there might be crashes due to llvm-spirv + // not detecting its own produced debug output sometimes (to + // report). 
OptCmd << LLVM_OPT << " -load-pass-plugin=" << LibPoCLPath - << " -passes=svm-offset -svm-offset-value=" << SVMOffset << " " - << OrigBcFileName << " -o " << OffsettedBcFileName; + << " -strip-debug -passes=svm-offset -svm-offset-value=" << SVMOffset + << " " << OrigBcFileName << " -o " << OffsettedBcFileName; if (system(OptCmd.str().c_str()) != EXIT_SUCCESS) return false; diff --git a/tests/regression/test_llvm_segfault_issue_889.c b/tests/regression/test_llvm_segfault_issue_889.c index 7ed2c1b067..4d39cad962 100644 --- a/tests/regression/test_llvm_segfault_issue_889.c +++ b/tests/regression/test_llvm_segfault_issue_889.c @@ -114,7 +114,7 @@ main () printf ("binary size [%zd]: %zd\n", i, binsizes[i]); CHECK_CL_ERROR (clReleaseProgram (program)); - + CHECK_CL_ERROR (clReleaseCommandQueue (command_queue)); CHECK_CL_ERROR (clReleaseContext (context)); printf ("OK\n"); diff --git a/tests/workgroup/run_kernel.c b/tests/workgroup/run_kernel.c index 7a7da4a581..27f6bf6389 100644 --- a/tests/workgroup/run_kernel.c +++ b/tests/workgroup/run_kernel.c @@ -152,6 +152,9 @@ main (int argc, char **argv) if (context) clReleaseContext (context); + free (source); + free (devices); + if (err == CL_SUCCESS) { printf ("OK\n"); diff --git a/tools/scripts/format-diff.sh b/tools/scripts/format-diff.sh index 4fec5bd29f..f9e079559e 100755 --- a/tools/scripts/format-diff.sh +++ b/tools/scripts/format-diff.sh @@ -15,7 +15,7 @@ pushd ${GITROOT} > /dev/null PATCHY=$(mktemp /tmp/pocl.XXXXXXXX.patch) trap "rm -f $PATCHY" EXIT -git diff -U0 --no-color >$PATCHY +git diff "$@" -U0 --no-color >$PATCHY $RELPATH/clang-format-diff.py -regex '.*(\.h$|\.c$|\.cl$)' -i -p1 -style GNU <$PATCHY $RELPATH/clang-format-diff.py -regex '(.*(\.hpp$|\.hh$|\.cc$|\.cpp$))|(lib/llvmopencl/.*)|(lib/CL/devices/tce/.*)' -i -p1 -style LLVM <$PATCHY