From dc92bff334e3463b95c03485694be70a1afe92eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pekka=20J=C3=A4=C3=A4skel=C3=A4inen?= Date: Thu, 14 Mar 2024 11:15:09 +0200 Subject: [PATCH 1/8] Misc. non-functional cleanups ...around the functional changes. --- doc/sphinx/source/notes_6_0.rst | 18 ++++++------ doc/sphinx/source/using.rst | 3 +- lib/CL/clGetDeviceInfo.c | 4 +-- lib/CL/devices/common.c | 8 +++--- lib/CL/devices/common_utils.c | 3 +- lib/CL/devices/cpuinfo.c | 2 +- lib/CL/pocl_cache.c | 5 ++-- lib/kernel/subgroups.c | 34 ++-------------------- lib/kernel/work_group_alloca.h | 50 +++++++++++++++++++++++++++++++++ 9 files changed, 75 insertions(+), 52 deletions(-) create mode 100644 lib/kernel/work_group_alloca.h diff --git a/doc/sphinx/source/notes_6_0.rst b/doc/sphinx/source/notes_6_0.rst index ec0a37b8a3..6c1d63a10e 100644 --- a/doc/sphinx/source/notes_6_0.rst +++ b/doc/sphinx/source/notes_6_0.rst @@ -2,14 +2,6 @@ Release Notes for PoCL 6.0 ************************** - - -Minimal support for `cl_khr_priority_hints` and `cl_khr_throttle_hints` has been added. -As the extension specification states that these hints provide no guarantees of -any particular behavior (or lack thereof) they are treated as a no-op. However -specifying them no longer causes `clCreateCommandQueueWithProperties` to return -an error. - ============================ New device driver: cpu-tbb ============================ @@ -18,6 +10,16 @@ The cpu-tbb device driver uses the Intel oneAPI Threading Building Blocks (oneTB library for work-group and kernel-level task scheduling. Except for the task scheduler, the driver is identical to the original 'cpu' driver (pthread). +===================================== +Command queue priority/throttle hints +===================================== + +Minimal support for `cl_khr_priority_hints` and `cl_khr_throttle_hints` has been added. 
+As the extension specification states that these hints provide no guarantees of +any particular behavior (or lack thereof) they are treated as a no-op. However +specifying them no longer causes `clCreateCommandQueueWithProperties` to return +an error. + =========================== Driver-specific features =========================== diff --git a/doc/sphinx/source/using.rst b/doc/sphinx/source/using.rst index e54023e6b4..44d7dd660f 100644 --- a/doc/sphinx/source/using.rst +++ b/doc/sphinx/source/using.rst @@ -170,7 +170,8 @@ pocl. The old way (setting POCL_DEBUG to 1) has been updated to support categories. Using this limits the amount of debug messages produced. Current options are: - error,warning,general,memory,llvm,events,cache,locking,refcounts,timing,hsa,tce,cuda,vulkan,proxy,all. + 'error', 'warning', 'general', 'memory', 'llvm', 'events', 'cache', 'locking', + 'refcounts', 'timing', 'hsa', 'tce', 'cuda', 'vulkan', 'proxy' and 'all'. Note: setting POCL_DEBUG to 1 still works and equals error+warning+general. 
- **POCL_DEBUG_LLVM_PASSES** diff --git a/lib/CL/clGetDeviceInfo.c b/lib/CL/clGetDeviceInfo.c index 41e456b052..f2ecc82571 100644 --- a/lib/CL/clGetDeviceInfo.c +++ b/lib/CL/clGetDeviceInfo.c @@ -61,14 +61,14 @@ POname(clGetDeviceInfo)(cl_device_id device, case CL_DEVICE_IMAGE_SUPPORT: POCL_RETURN_GETINFO(cl_bool, device->image_support); case CL_DEVICE_TYPE: - POCL_RETURN_GETINFO(cl_device_type, device->type); + POCL_RETURN_GETINFO (cl_device_type, device->type); case CL_DEVICE_VENDOR_ID: POCL_RETURN_GETINFO(cl_uint, device->vendor_id); case CL_DEVICE_MAX_COMPUTE_UNITS: POCL_RETURN_GETINFO(cl_uint, device->max_compute_units); case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS : POCL_RETURN_GETINFO(cl_uint, device->max_work_item_dimensions); - case CL_DEVICE_MAX_WORK_GROUP_SIZE : + case CL_DEVICE_MAX_WORK_GROUP_SIZE: { size_t max_wg_size = device->max_work_group_size; POCL_RETURN_GETINFO(size_t, max_wg_size); diff --git a/lib/CL/devices/common.c b/lib/CL/devices/common.c index 40298250bd..66f54fed0d 100644 --- a/lib/CL/devices/common.c +++ b/lib/CL/devices/common.c @@ -1984,7 +1984,7 @@ pocl_setup_ils_with_version (cl_device_id dev) } } -static const cl_name_version OPENCL_FEATURES[] = { +static const cl_name_version OPENCL_C_FEATURES[] = { { CL_MAKE_VERSION (3, 0, 0), "__opencl_c_3d_image_writes" }, { CL_MAKE_VERSION (3, 0, 0), "__opencl_c_images" }, { CL_MAKE_VERSION (3, 0, 0), "__opencl_c_read_write_images" }, @@ -2013,15 +2013,15 @@ static const cl_name_version OPENCL_FEATURES[] = { { CL_MAKE_VERSION (3, 0, 0), "__opencl_c_ext_fp64_local_atomic_min_max" }, }; -const size_t OPENCL_FEATURES_NUM - = sizeof (OPENCL_FEATURES) / sizeof (OPENCL_FEATURES[0]); +const size_t OPENCL_C_FEATURES_NUM + = sizeof (OPENCL_C_FEATURES) / sizeof (OPENCL_C_FEATURES[0]); void pocl_setup_features_with_version (cl_device_id dev) { cl_name_version *tmp = NULL; unsigned ret = pocl_space_delim_string_to_cl_name_version_array ( - &tmp, dev->features, OPENCL_FEATURES, OPENCL_FEATURES_NUM); + &tmp, 
dev->features, OPENCL_C_FEATURES, OPENCL_C_FEATURES_NUM); dev->num_opencl_features_with_version = ret; dev->opencl_features_with_version = tmp; diff --git a/lib/CL/devices/common_utils.c b/lib/CL/devices/common_utils.c index 32551537b7..0ba0e84c68 100644 --- a/lib/CL/devices/common_utils.c +++ b/lib/CL/devices/common_utils.c @@ -105,9 +105,8 @@ align_ptr (char *p) #define FALLBACK_MAX_THREAD_COUNT 8 -/* initializes CPU-specific device info struct members, that cannot / should +/* Initializes CPU-specific device info default, that cannot / should not be initialized in pocl_init_default_device_infos() */ - cl_int pocl_cpu_init_common (cl_device_id device) { diff --git a/lib/CL/devices/cpuinfo.c b/lib/CL/devices/cpuinfo.c index 4919915b34..d1b7a6082b 100644 --- a/lib/CL/devices/cpuinfo.c +++ b/lib/CL/devices/cpuinfo.c @@ -313,7 +313,7 @@ pocl_cpuinfo_get_cpu_name_and_vendor(cl_device_id device) FILE *f = fopen (cpuinfo, "r"); char contents[MAX_CPUINFO_SIZE]; - int num_read = fread (contents, 1, MAX_CPUINFO_SIZE - 1, f); + int num_read = fread (contents, 1, MAX_CPUINFO_SIZE - 1, f); fclose(f); contents[num_read]='\0'; diff --git a/lib/CL/pocl_cache.c b/lib/CL/pocl_cache.c index ecf5070fd1..13b3026aed 100644 --- a/lib/CL/pocl_cache.c +++ b/lib/CL/pocl_cache.c @@ -92,12 +92,11 @@ void pocl_cache_program_path(char* path, program_device_dir (path, program, device_i, ""); } -// required in llvm API void pocl_cache_program_bc_path(char* program_bc_path, cl_program program, unsigned device_i) { - program_device_dir(program_bc_path, program, - device_i, POCL_PROGRAM_BC_FILENAME); + program_device_dir (program_bc_path, program, + device_i, POCL_PROGRAM_BC_FILENAME); } void diff --git a/lib/kernel/subgroups.c b/lib/kernel/subgroups.c index be48fc59e3..a836f92ab9 100644 --- a/lib/kernel/subgroups.c +++ b/lib/kernel/subgroups.c @@ -29,37 +29,11 @@ #include -/** - * \brief Internal pseudo function which allocates space from the work-group - * thread's stack (basically local memory) 
for each work-item. - * - * It's expanded in WorkitemLoops.cc to an alloca(). - * - * @param element_size The size of an element to allocate (for all WIs in the - * WG). - * @param align The alignment of the start of chunk. - * @param extra_bytes extra bytes to add to the allocation, some functions need - * extra space - * @return pointer to the allocated stack space (freed at unwind). - */ -void *__pocl_work_group_alloca (size_t element_size, size_t align, - size_t extra_bytes); - -/** - * \brief Internal pseudo function which allocates space from the work-group - * thread's stack (basically local memory). - * - * It's expanded in WorkitemLoops.cc to an alloca(). - * - * @param bytes The size of data to allocate in bytes. - * @param align The alignment of the start of chunk. - * @return pointer to the allocated stack space (freed at unwind). - */ -void *__pocl_local_mem_alloca (size_t bytes, size_t align); - -size_t _CL_OVERLOADABLE get_local_size (unsigned int dimindx); +#include "work_group_alloca.h" size_t _CL_OVERLOADABLE get_local_id (unsigned int dimindx); +size_t _CL_OVERLOADABLE get_local_linear_id (void); +size_t _CL_OVERLOADABLE get_local_size (unsigned int dimindx); /* Magic variable that is expanded in Workgroup.cc */ extern uint _pocl_sub_group_size; @@ -89,8 +63,6 @@ get_enqueued_num_sub_groups (void) return 1; } -size_t _CL_OVERLOADABLE get_local_linear_id (void); - uint _CL_OVERLOADABLE get_sub_group_id (void) { diff --git a/lib/kernel/work_group_alloca.h b/lib/kernel/work_group_alloca.h new file mode 100644 index 0000000000..1cff7e5cb8 --- /dev/null +++ b/lib/kernel/work_group_alloca.h @@ -0,0 +1,50 @@ +/* OpenCL built-in library: internal work group memory allocation functionality + + Copyright (c) 2022-2023 Pekka Jääskeläinen / Intel Finland Oy + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to + deal in the Software without restriction, 
including without limitation the + rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + sell copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + IN THE SOFTWARE. +*/ + +/** + * \brief Internal pseudo function which allocates space from the work-group + * thread's stack (basically local memory) for each work-item. + * + * It's expanded in WorkitemLoops.cc to an alloca(). + * + * @param element_size The size of an element to allocate (for all WIs in the + * WG). + * @param align The alignment of the start of chunk. + * @param extra_bytes extra bytes to add to the allocation, some functions need + * extra space + * @return pointer to the allocated stack space (freed at unwind). + */ +void *__pocl_work_group_alloca (size_t element_size, size_t align, + size_t extra_bytes); + +/** + * \brief Internal pseudo function which allocates space from the work-group + * thread's stack (basically local memory). + * + * It's expanded in WorkitemLoops.cc to an alloca(). + * + * @param bytes The size of data to allocate in bytes. + * @param align The alignment of the start of chunk. + * @return pointer to the allocated stack space (freed at unwind). 
+ */ +void *__pocl_local_mem_alloca (size_t bytes, size_t align); From fb6056c2a4fe79183771d2f1c761a6461c945131 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pekka=20J=C3=A4=C3=A4skel=C3=A4inen?= Date: Thu, 14 Mar 2024 11:18:23 +0200 Subject: [PATCH 2/8] Allow overriding the CPU driver vendor id and the driver version For example, setting the vendor id to be 32902 (0x8086) and setting the driver version using **POCL_DRIVER_VERSION_OVERRIDE** to "2023.16.7.0.21_160000" (or such) can be used to convince binary-distributed DPC++ compilers to compile and run SYCL programs on the PoCL-CPU driver. --- doc/sphinx/source/notes_6_0.rst | 9 ++++++--- doc/sphinx/source/using.rst | 13 +++++++++++++ lib/CL/devices/cpuinfo.c | 3 ++- lib/CL/devices/devices.c | 4 +++- 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/doc/sphinx/source/notes_6_0.rst b/doc/sphinx/source/notes_6_0.rst index 6c1d63a10e..097127bf1c 100644 --- a/doc/sphinx/source/notes_6_0.rst +++ b/doc/sphinx/source/notes_6_0.rst @@ -28,9 +28,12 @@ Driver-specific features CPU driver ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The 'cpu' driver gained support for using OpenMP for thread scheduling. -Support is disabled by default, but can be enabled with CMake option. The -'cpu-minimal' driver does not support OpenMP. + * Support for using OpenMP for task scheduling was added. It is disabled + by default, but can be enabled with CMake option. The 'cpu-minimal' + driver does not support OpenMP since it's supposed to be single-threaded. + * The CPU drivers can be now used for running SYCL programs compiled with + the oneAPI binary distributions of DPC++ by adding the following environment + settings: **POCL_DRIVER_VERSION_OVERRIDE=2023.16.7.0.21_160000 POCL_CPU_VENDOR_ID_OVERRIDE=32902**. 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Remote diff --git a/doc/sphinx/source/using.rst b/doc/sphinx/source/using.rst index 44d7dd660f..7e7ec6d7f8 100644 --- a/doc/sphinx/source/using.rst +++ b/doc/sphinx/source/using.rst @@ -160,6 +160,14 @@ pocl. 'cpu' device driver. The default is to determine this from the number of hardware threads available in the CPU. +- **POCL_CPU_VENDOR_ID_OVERRIDE** + + Overrides the vendor id reported by PoCL for the CPU drivers. + For example, setting the vendor id to be 32902 (0x8086) and setting the driver + version using **POCL_DRIVER_VERSION_OVERRIDE** to "2023.16.7.0.21_160000" (or such) can + be used to convince binary-distributed DPC++ compilers to compile and run SYCL + programs on the PoCL-CPU driver. + - **POCL_DEBUG** Enables debug messages to stderr. This will be mostly messages from error @@ -222,6 +230,11 @@ pocl. POCL_TTASIM0_PARAMETERS will be passed to the first ttasim driver instantiated and POCL_TTASIM1_PARAMETERS to the second one. +- **POCL_DRIVER_VERSION_OVERRIDE** + + Can be used to override the driver version reported by PoCL. + See **POCL_CPU_VENDOR_ID_OVERRIDE** for an example use case. + - **POCL_EXTRA_BUILD_FLAGS** Adds the contents of the environment variable to all clBuildProgram() calls. 
diff --git a/lib/CL/devices/cpuinfo.c b/lib/CL/devices/cpuinfo.c index d1b7a6082b..ca6026f5f9 100644 --- a/lib/CL/devices/cpuinfo.c +++ b/lib/CL/devices/cpuinfo.c @@ -305,7 +305,8 @@ pocl_cpuinfo_get_cpu_name_and_vendor(cl_device_id device) /* default vendor and vendor_id, in case it cannot be found by other means */ device->vendor = cpuvendor_default; if (device->vendor_id == 0) - device->vendor_id = CL_KHRONOS_VENDOR_ID_POCL; + device->vendor_id = pocl_get_int_option ("POCL_CPU_VENDOR_ID_OVERRIDE", + CL_KHRONOS_VENDOR_ID_POCL); /* read contents of /proc/cpuinfo */ if (access (cpuinfo, R_OK) != 0) diff --git a/lib/CL/devices/devices.c b/lib/CL/devices/devices.c index ce227e8f14..ca7996f7f9 100644 --- a/lib/CL/devices/devices.c +++ b/lib/CL/devices/devices.c @@ -669,7 +669,9 @@ pocl_init_devices () a shared global memory. */ dev->global_mem_id = dev_index; POCL_INIT_OBJECT (dev); - dev->driver_version = POCL_VERSION_FULL; + dev->driver_version = pocl_get_string_option ( + "POCL_DRIVER_VERSION_OVERRIDE", POCL_VERSION_FULL); + if (dev->version == NULL) dev->version = "OpenCL 2.0 pocl"; From 3435d6e9a2db4dec942a61f80c679b063965035d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pekka=20J=C3=A4=C3=A4skel=C3=A4inen?= Date: Thu, 14 Mar 2024 11:22:33 +0200 Subject: [PATCH 3/8] Limit the max dirname further to avoid probs. with gdb GDB seems to fail to load symbols from .so files which have longer pathnames than 511, thus made the directory length even shorter. Long directory names are produced a lot with C++/SYCL templated kernel/function names. --- include/pocl.h | 8 +++++--- lib/CL/devices/remote/remote.c | 2 +- lib/CL/pocl_cache.c | 4 ++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/include/pocl.h b/include/pocl.h index 1714b02fd0..ea2e2e5276 100644 --- a/include/pocl.h +++ b/include/pocl.h @@ -47,9 +47,11 @@ /* detects restrict, variadic macros etc */ #include "pocl_compiler_features.h" -/* The maximum file, directory and path name lengths. 
TODO: These should be - detected from the filesystem properties of the execution platform. */ -#define POCL_MAX_DIRNAME_LENGTH 255 +/* The maximum file, directory and path name lengths. + NOTE: GDB seems to fail to load symbols from .so files which have + longer pathnames than 511, thus the quite small dir/filename length + limiter. */ +#define POCL_MAX_DIRNAME_LENGTH 64 #define POCL_MAX_FILENAME_LENGTH (POCL_MAX_DIRNAME_LENGTH) #define POCL_MAX_PATHNAME_LENGTH 4096 diff --git a/lib/CL/devices/remote/remote.c b/lib/CL/devices/remote/remote.c index 6d89cc3364..a589eb7146 100644 --- a/lib/CL/devices/remote/remote.c +++ b/lib/CL/devices/remote/remote.c @@ -686,7 +686,7 @@ setup_relevant_devices (cl_program program, cl_device_id device, remote_server_data_t *server = ((remote_device_data_t *)device->data)->server; unsigned num_relevant_devices = 0; - char program_bc_path[POCL_MAX_FILENAME_LENGTH]; + char program_bc_path[POCL_MAX_PATHNAME_LENGTH]; unsigned i, j; for (i = 0; i < program->num_devices; ++i) diff --git a/lib/CL/pocl_cache.c b/lib/CL/pocl_cache.c index 13b3026aed..a620694a30 100644 --- a/lib/CL/pocl_cache.c +++ b/lib/CL/pocl_cache.c @@ -207,9 +207,9 @@ pocl_cache_kernel_cachedir (char *kernel_cachedir_path, cl_program program, { int bytes_written; char tempstring[POCL_MAX_PATHNAME_LENGTH]; - char file_name[POCL_MAX_DIRNAME_LENGTH + 1]; + char file_name[POCL_MAX_FILENAME_LENGTH + 1]; - pocl_hash_clipped_name (kernel_name, POCL_MAX_DIRNAME_LENGTH, &file_name[0]); + pocl_hash_clipped_name (kernel_name, POCL_MAX_FILENAME_LENGTH, &file_name[0]); bytes_written = snprintf (tempstring, POCL_MAX_PATHNAME_LENGTH, "/%s", file_name); From 575ba8795fa807c2637329d3dee99a4e47917534 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pekka=20J=C3=A4=C3=A4skel=C3=A4inen?= Date: Thu, 14 Mar 2024 11:24:05 +0200 Subject: [PATCH 4/8] Detect SPIR-Vs even if client doesn't have SPIR-V setup Since the detection doesn't require llvm-spirv and the remote can support compiling from SPIR-Vs, we 
should not enable the runtime check only when the client has LLVM-SPIRV installed and enabled. This allows offloading SPIR-Vs from remote-only client builds. --- lib/CL/clCreateProgramWithIL.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lib/CL/clCreateProgramWithIL.c b/lib/CL/clCreateProgramWithIL.c index d6a7265026..90e3647834 100644 --- a/lib/CL/clCreateProgramWithIL.c +++ b/lib/CL/clCreateProgramWithIL.c @@ -128,16 +128,13 @@ CL_API_SUFFIX__VERSION_2_1 POCL_GOTO_ERROR_COND ((length == 0), CL_INVALID_VALUE); int is_spirv = 0; -#ifdef ENABLE_SPIRV int is_spirv_kernel = pocl_bitcode_is_spirv_execmodel_kernel ((const char *)il, length); is_spirv += is_spirv_kernel; -#endif -#ifdef ENABLE_VULKAN + int is_spirv_shader = pocl_bitcode_is_spirv_execmodel_shader ((const char *)il, length); is_spirv += is_spirv_shader; -#endif POCL_GOTO_ERROR_ON ( (!is_spirv), CL_INVALID_VALUE, From 63b03cb7da9c3f3c9480752e00fe15f304f0a105 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pekka=20J=C3=A4=C3=A4skel=C3=A4inen?= Date: Thu, 14 Mar 2024 11:25:51 +0200 Subject: [PATCH 5/8] PoCL-D: SVM offsetting run to strip debug symbols Seems llvm-spirv fails to recompile its own output if the result contained unknown debug MD (likely produced with newer LLVM-SPIRV/LLVM). Strip the debug symbols off for now to circumvent the problem. --- pocld/shared_cl_context.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pocld/shared_cl_context.cc b/pocld/shared_cl_context.cc index 2e459fdf79..f5f7994cce 100644 --- a/pocld/shared_cl_context.cc +++ b/pocld/shared_cl_context.cc @@ -1260,9 +1260,12 @@ bool createSPIRVWithSVMOffset(const std::vector *InputSPV, LibPoCLPath /= std::filesystem::path(BUILDDIR) / "lib" / "CL" / "libpocl.so"; + // Without -strip-debug there might be crashes due to llvm-spirv + // not detecting its own produced debug output sometimes (to + // report). 
OptCmd << LLVM_OPT << " -load-pass-plugin=" << LibPoCLPath - << " -passes=svm-offset -svm-offset-value=" << SVMOffset << " " - << OrigBcFileName << " -o " << OffsettedBcFileName; + << " -strip-debug -passes=svm-offset -svm-offset-value=" << SVMOffset + << " " << OrigBcFileName << " -o " << OffsettedBcFileName; if (system(OptCmd.str().c_str()) != EXIT_SUCCESS) return false; From acd2c902dd77c0e7617d40f6afe8c62c8ed1e369 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pekka=20J=C3=A4=C3=A4skel=C3=A4inen?= Date: Thu, 14 Mar 2024 11:33:09 +0200 Subject: [PATCH 6/8] CPU: Implement __opencl_c_work_group_collective_functions --- CMakeLists.txt | 3 +- doc/sphinx/source/notes_6_0.rst | 1 + lib/CL/clGetDeviceInfo.c | 2 +- lib/CL/devices/common_utils.c | 1 + lib/CL/pocl_cl.h | 1 + lib/kernel/host/CMakeLists.txt | 1 + lib/kernel/work_group.c | 228 ++++++++++++++++++++++++++++++++ 7 files changed, 235 insertions(+), 2 deletions(-) create mode 100644 lib/kernel/work_group.c diff --git a/CMakeLists.txt b/CMakeLists.txt index c3733f1280..edbc68ee39 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1325,7 +1325,8 @@ cl_exp_pinned_buffers") set(HOST_DEVICE_FEATURES_30 "__opencl_c_3d_image_writes __opencl_c_images \ __opencl_c_atomic_order_acq_rel __opencl_c_atomic_order_seq_cst \ __opencl_c_atomic_scope_device __opencl_c_program_scope_global_variables \ -__opencl_c_atomic_scope_all_devices __opencl_c_generic_address_space") +__opencl_c_atomic_scope_all_devices __opencl_c_generic_address_space \ +__opencl_c_work_group_collective_functions") # Host CPU device: extensions only enabled when conformance is OFF if(NOT ENABLE_CONFORMANCE) diff --git a/doc/sphinx/source/notes_6_0.rst b/doc/sphinx/source/notes_6_0.rst index 097127bf1c..27d21357ea 100644 --- a/doc/sphinx/source/notes_6_0.rst +++ b/doc/sphinx/source/notes_6_0.rst @@ -34,6 +34,7 @@ CPU driver * The CPU drivers can be now used for running SYCL programs compiled with the oneAPI binary distributions of DPC++ by adding the following 
environment settings: **POCL_DRIVER_VERSION_OVERRIDE=2023.16.7.0.21_160000 POCL_CPU_VENDOR_ID_OVERRIDE=32902**. + * Added support for the **__opencl_c_work_group_collective_functions** feature. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Remote diff --git a/lib/CL/clGetDeviceInfo.c b/lib/CL/clGetDeviceInfo.c index f2ecc82571..c3e49aa59c 100644 --- a/lib/CL/clGetDeviceInfo.c +++ b/lib/CL/clGetDeviceInfo.c @@ -342,7 +342,7 @@ POname(clGetDeviceInfo)(cl_device_id device, case CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT: POCL_RETURN_GETINFO (cl_bool, device->non_uniform_work_group_support); case CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT: - POCL_RETURN_GETINFO (cl_bool, CL_FALSE); + POCL_RETURN_GETINFO (cl_bool, device->wg_collective_func_support); case CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT: POCL_RETURN_GETINFO (cl_bool, device->generic_as_support); case CL_DEVICE_DEVICE_ENQUEUE_CAPABILITIES: diff --git a/lib/CL/devices/common_utils.c b/lib/CL/devices/common_utils.c index 0ba0e84c68..5e17acc5dd 100644 --- a/lib/CL/devices/common_utils.c +++ b/lib/CL/devices/common_utils.c @@ -138,6 +138,7 @@ pocl_cpu_init_common (cl_device_id device) device->features = HOST_DEVICE_FEATURES_30; device->run_program_scope_variables_pass = CL_TRUE; device->generic_as_support = CL_TRUE; + device->wg_collective_func_support = CL_TRUE; pocl_setup_opencl_c_with_version (device, CL_TRUE); pocl_setup_features_with_version (device); diff --git a/lib/CL/pocl_cl.h b/lib/CL/pocl_cl.h index 8dd114868d..f0881f4501 100644 --- a/lib/CL/pocl_cl.h +++ b/lib/CL/pocl_cl.h @@ -853,6 +853,7 @@ struct _cl_device_id { size_t preferred_wg_size_multiple; cl_bool non_uniform_work_group_support; cl_bool generic_as_support; + cl_bool wg_collective_func_support; cl_uint preferred_vector_width_char; cl_uint preferred_vector_width_short; cl_uint preferred_vector_width_int; diff --git a/lib/kernel/host/CMakeLists.txt b/lib/kernel/host/CMakeLists.txt index 77d34b479b..27b3d3959b 100644 --- 
a/lib/kernel/host/CMakeLists.txt +++ b/lib/kernel/host/CMakeLists.txt @@ -139,6 +139,7 @@ vload_store_half_f16c.c vstore.cl vstore_half.cl wait_group_events.cl +work_group.c write_image.cl ################################################################### diff --git a/lib/kernel/work_group.c b/lib/kernel/work_group.c new file mode 100644 index 0000000000..c68f013360 --- /dev/null +++ b/lib/kernel/work_group.c @@ -0,0 +1,228 @@ +/* OpenCL built-in library: work-group collective functions + + Copyright (c) 2024 Pekka Jääskeläinen / Intel Finland Oy + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to + deal in the Software without restriction, including without limitation the + rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + sell copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + IN THE SOFTWARE. 
+*/ + +#include "work_group_alloca.h" +#include +#include + +size_t _CL_OVERLOADABLE get_local_id (unsigned int dimindx); +size_t _CL_OVERLOADABLE get_local_linear_id (void); +size_t _CL_OVERLOADABLE get_local_size (unsigned int dimindx); +void _CL_OVERLOADABLE + POCL_BUILTIN_PREFIX (work_group_barrier) (cl_mem_fence_flags flags); + +#define work_group_barrier POCL_BUILTIN_PREFIX (work_group_barrier) + +/* Align the stack temporary data by this multiple to facilitate easier + vectorization. */ +#define ALIGN_ELEMENT_MULTIPLE 32 + +static size_t +get_total_local_size () +{ + return get_local_size (0) * get_local_size (1) * get_local_size (2); +} + +#define WORK_GROUP_SHUFFLE_PT(PREFIX, TYPE) \ + __attribute__ ((always_inline)) static TYPE _CL_OVERLOADABLE \ + PREFIX##work_group_shuffle (TYPE val, size_t id) \ + { \ + volatile TYPE *temp_storage = __pocl_work_group_alloca ( \ + sizeof (TYPE), ALIGN_ELEMENT_MULTIPLE * sizeof (TYPE), 0); \ + temp_storage[get_local_linear_id ()] = val; \ + work_group_barrier (CLK_LOCAL_MEM_FENCE); \ + return temp_storage[id % get_total_local_size ()]; \ + } + +/* Define both the non-prefixed (khr) and Intel-prefixed shuffles. 
*/ +#define WORK_GROUP_SHUFFLE_T(TYPE) WORK_GROUP_SHUFFLE_PT (, TYPE) + +WORK_GROUP_SHUFFLE_T (char) +WORK_GROUP_SHUFFLE_T (uchar) +WORK_GROUP_SHUFFLE_T (short) +WORK_GROUP_SHUFFLE_T (ushort) +WORK_GROUP_SHUFFLE_T (int) +WORK_GROUP_SHUFFLE_T (uint) +WORK_GROUP_SHUFFLE_T (long) +WORK_GROUP_SHUFFLE_T (ulong) +WORK_GROUP_SHUFFLE_T (float) +WORK_GROUP_SHUFFLE_T (double) + +#define WORK_GROUP_BROADCAST_T(TYPE) \ + __attribute__ ((always_inline)) TYPE _CL_OVERLOADABLE \ + work_group_broadcast (TYPE val, size_t x) \ + { \ + return work_group_shuffle (val, x); \ + } \ + __attribute__ ((always_inline)) TYPE _CL_OVERLOADABLE \ + work_group_broadcast (TYPE val, size_t x, size_t y) \ + { \ + return work_group_shuffle (val, y * get_local_size (0) + x); \ + } \ + __attribute__ ((always_inline)) TYPE _CL_OVERLOADABLE \ + work_group_broadcast (TYPE val, size_t x, size_t y, size_t z) \ + { \ + return work_group_shuffle (val, \ + z * get_local_size (1) * get_local_size (0) \ + + y * get_local_size (0) + x); \ + } + +WORK_GROUP_BROADCAST_T (int) +WORK_GROUP_BROADCAST_T (uint) +WORK_GROUP_BROADCAST_T (long) +WORK_GROUP_BROADCAST_T (ulong) +WORK_GROUP_BROADCAST_T (float) +WORK_GROUP_BROADCAST_T (double) + +#define WORK_GROUP_REDUCE_OT(OPNAME, OPERATION, TYPE) \ + __attribute__ ((always_inline)) \ + TYPE _CL_OVERLOADABLE work_group_reduce_##OPNAME (TYPE val) \ + { \ + volatile TYPE *temp_storage = __pocl_work_group_alloca ( \ + sizeof (TYPE), ALIGN_ELEMENT_MULTIPLE * sizeof (TYPE), 0); \ + temp_storage[get_local_linear_id ()] = val; \ + work_group_barrier (CLK_LOCAL_MEM_FENCE); \ + if (get_local_linear_id () == 0) \ + { \ + for (uint i = 1; i < get_total_local_size (); ++i) \ + { \ + TYPE a = temp_storage[0], b = temp_storage[i]; \ + temp_storage[0] = OPERATION; \ + } \ + } \ + work_group_barrier (CLK_LOCAL_MEM_FENCE); \ + return temp_storage[0]; \ + } + +#define WORK_GROUP_REDUCE_T(OPNAME, OPERATION) \ + WORK_GROUP_REDUCE_OT (OPNAME, OPERATION, int) \ + WORK_GROUP_REDUCE_OT (OPNAME, 
OPERATION, uint) \ + WORK_GROUP_REDUCE_OT (OPNAME, OPERATION, long) \ + WORK_GROUP_REDUCE_OT (OPNAME, OPERATION, ulong) \ + WORK_GROUP_REDUCE_OT (OPNAME, OPERATION, float) \ + WORK_GROUP_REDUCE_OT (OPNAME, OPERATION, double) + +WORK_GROUP_REDUCE_T (add, a + b) +WORK_GROUP_REDUCE_T (min, a > b ? b : a) +WORK_GROUP_REDUCE_T (max, a > b ? a : b) + +#define WORK_GROUP_SCAN_INCLUSIVE_OT(OPNAME, OPERATION, TYPE) \ + __attribute__ ((always_inline)) \ + TYPE _CL_OVERLOADABLE work_group_scan_inclusive_##OPNAME (TYPE val) \ + { \ + volatile TYPE *data = __pocl_work_group_alloca ( \ + sizeof (TYPE), ALIGN_ELEMENT_MULTIPLE * sizeof (TYPE), 0); \ + data[get_local_linear_id ()] = val; \ + work_group_barrier (CLK_LOCAL_MEM_FENCE); \ + if (get_local_linear_id () == 0) \ + { \ + for (uint i = 1; i < get_total_local_size (); ++i) \ + { \ + TYPE a = data[i - 1], b = data[i]; \ + data[i] = OPERATION; \ + } \ + } \ + work_group_barrier (CLK_LOCAL_MEM_FENCE); \ + return data[get_local_linear_id ()]; \ + } + +#define WORK_GROUP_SCAN_INCLUSIVE_T(OPNAME, OPERATION) \ + WORK_GROUP_SCAN_INCLUSIVE_OT (OPNAME, OPERATION, int) \ + WORK_GROUP_SCAN_INCLUSIVE_OT (OPNAME, OPERATION, uint) \ + WORK_GROUP_SCAN_INCLUSIVE_OT (OPNAME, OPERATION, long) \ + WORK_GROUP_SCAN_INCLUSIVE_OT (OPNAME, OPERATION, ulong) \ + WORK_GROUP_SCAN_INCLUSIVE_OT (OPNAME, OPERATION, float) \ + WORK_GROUP_SCAN_INCLUSIVE_OT (OPNAME, OPERATION, double) + +WORK_GROUP_SCAN_INCLUSIVE_T (add, a + b) +WORK_GROUP_SCAN_INCLUSIVE_T (min, a > b ? b : a) +WORK_GROUP_SCAN_INCLUSIVE_T (max, a > b ? 
a : b) + +#define WORK_GROUP_SCAN_EXCLUSIVE_OT(OPNAME, OPERATION, TYPE, ID) \ + __attribute__ ((always_inline)) \ + TYPE _CL_OVERLOADABLE work_group_scan_exclusive_##OPNAME (TYPE val) \ + { \ + volatile TYPE *data = __pocl_work_group_alloca ( \ + sizeof (TYPE), ALIGN_ELEMENT_MULTIPLE * sizeof (TYPE), \ + sizeof (TYPE)); \ + data[get_local_linear_id () + 1] = val; \ + data[0] = ID; \ + work_group_barrier (CLK_LOCAL_MEM_FENCE); \ + if (get_local_linear_id () == 0) \ + { \ + for (uint i = 1; i < get_total_local_size (); ++i) \ + { \ + TYPE a = data[i - 1], b = data[i]; \ + data[i] = OPERATION; \ + } \ + } \ + work_group_barrier (CLK_LOCAL_MEM_FENCE); \ + return data[get_local_linear_id ()]; \ + } + +WORK_GROUP_SCAN_EXCLUSIVE_OT (add, a + b, int, 0) +WORK_GROUP_SCAN_EXCLUSIVE_OT (add, a + b, uint, 0) +WORK_GROUP_SCAN_EXCLUSIVE_OT (add, a + b, long, 0) +WORK_GROUP_SCAN_EXCLUSIVE_OT (add, a + b, ulong, 0) +WORK_GROUP_SCAN_EXCLUSIVE_OT (add, a + b, float, 0.0f) +WORK_GROUP_SCAN_EXCLUSIVE_OT (add, a + b, double, 0.0) + +WORK_GROUP_SCAN_EXCLUSIVE_OT (min, a > b ? b : a, int, INT_MAX) +WORK_GROUP_SCAN_EXCLUSIVE_OT (min, a > b ? b : a, uint, UINT_MAX) +WORK_GROUP_SCAN_EXCLUSIVE_OT (min, a > b ? b : a, long, LONG_MAX) +WORK_GROUP_SCAN_EXCLUSIVE_OT (min, a > b ? b : a, ulong, ULONG_MAX) +WORK_GROUP_SCAN_EXCLUSIVE_OT (min, a > b ? b : a, float, +INFINITY) +WORK_GROUP_SCAN_EXCLUSIVE_OT (min, a > b ? b : a, double, +INFINITY) + +WORK_GROUP_SCAN_EXCLUSIVE_OT (max, a > b ? a : b, int, INT_MIN) +WORK_GROUP_SCAN_EXCLUSIVE_OT (max, a > b ? a : b, uint, 0) +WORK_GROUP_SCAN_EXCLUSIVE_OT (max, a > b ? a : b, long, LONG_MIN) +WORK_GROUP_SCAN_EXCLUSIVE_OT (max, a > b ? a : b, ulong, 0) +WORK_GROUP_SCAN_EXCLUSIVE_OT (max, a > b ? a : b, float, -INFINITY) +WORK_GROUP_SCAN_EXCLUSIVE_OT (max, a > b ? a : b, double, -INFINITY) + +__attribute__ ((always_inline)) int _CL_OVERLOADABLE +work_group_any (int predicate) +{ + /* The results for all of the WIs. 
*/ + int *flags = __pocl_work_group_alloca ( + sizeof (int), ALIGN_ELEMENT_MULTIPLE * sizeof (int), 0); + /* The final result. */ + flags[get_local_linear_id ()] = !!predicate; + int *result = __pocl_work_group_alloca (sizeof (int), sizeof (int), 0); + work_group_barrier (CLK_LOCAL_MEM_FENCE); + if (get_local_linear_id () == 0) + { + *result = 0; + for (uint i = 0; i < get_total_local_size (); ++i) + *result |= flags[i]; + } + work_group_barrier (CLK_LOCAL_MEM_FENCE); + return *result; +} + +__attribute__ ((always_inline)) int _CL_OVERLOADABLE +work_group_all (int predicate) +{ + return !work_group_any (!predicate); +} From 74f0357041666b9fcd3a39e9b4d27ae287b62eb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pekka=20J=C3=A4=C3=A4skel=C3=A4inen?= Date: Thu, 14 Mar 2024 11:33:57 +0200 Subject: [PATCH 7/8] format-diff.sh: allow passing parameters to git diff Passing '--staged' is useful for formatting newly added files. --- tools/scripts/format-diff.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/scripts/format-diff.sh b/tools/scripts/format-diff.sh index 4fec5bd29f..f9e079559e 100755 --- a/tools/scripts/format-diff.sh +++ b/tools/scripts/format-diff.sh @@ -15,7 +15,7 @@ pushd ${GITROOT} > /dev/null PATCHY=$(mktemp /tmp/pocl.XXXXXXXX.patch) trap "rm -f $PATCHY" EXIT -git diff -U0 --no-color >$PATCHY +git diff "$@" -U0 --no-color >$PATCHY $RELPATH/clang-format-diff.py -regex '.*(\.h$|\.c$|\.cl$)' -i -p1 -style GNU <$PATCHY $RELPATH/clang-format-diff.py -regex '(.*(\.hpp$|\.hh$|\.cc$|\.cpp$))|(lib/llvmopencl/.*)|(lib/CL/devices/tce/.*)' -i -p1 -style LLVM <$PATCHY From e174847ba30eed08a370df46adb503e00698c781 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pekka=20J=C3=A4=C3=A4skel=C3=A4inen?= Date: Thu, 14 Mar 2024 15:26:47 +0200 Subject: [PATCH 8/8] A couple of memleak fixes --- examples/boxadd/boxadd.c | 4 ++++ examples/matadd/matadd.c | 4 ++++ tests/regression/test_llvm_segfault_issue_889.c | 2 +- tests/workgroup/run_kernel.c | 3 +++ 4 files changed, 
12 insertions(+), 1 deletion(-) diff --git a/examples/boxadd/boxadd.c b/examples/boxadd/boxadd.c index a89074d611..38ca19924d 100644 --- a/examples/boxadd/boxadd.c +++ b/examples/boxadd/boxadd.c @@ -112,5 +112,9 @@ main (int argc, char **argv) CHECK_CL_ERROR (clReleaseContext (context)); CHECK_CL_ERROR (clUnloadPlatformCompiler (platform)); + free (srcA); + free (srcB); + free (dst); + return err; } diff --git a/examples/matadd/matadd.c b/examples/matadd/matadd.c index 2c54d597d1..09690ef5be 100644 --- a/examples/matadd/matadd.c +++ b/examples/matadd/matadd.c @@ -108,5 +108,9 @@ main (int argc, char **argv) CHECK_CL_ERROR (clReleaseContext (context)); CHECK_CL_ERROR (clUnloadPlatformCompiler (platform)); + free (srcA); + free (srcB); + free (dst); + return err; } diff --git a/tests/regression/test_llvm_segfault_issue_889.c b/tests/regression/test_llvm_segfault_issue_889.c index 7ed2c1b067..4d39cad962 100644 --- a/tests/regression/test_llvm_segfault_issue_889.c +++ b/tests/regression/test_llvm_segfault_issue_889.c @@ -114,7 +114,7 @@ main () printf ("binary size [%zd]: %zd\n", i, binsizes[i]); CHECK_CL_ERROR (clReleaseProgram (program)); - + CHECK_CL_ERROR (clReleaseCommandQueue (command_queue)); CHECK_CL_ERROR (clReleaseContext (context)); printf ("OK\n"); diff --git a/tests/workgroup/run_kernel.c b/tests/workgroup/run_kernel.c index 7a7da4a581..27f6bf6389 100644 --- a/tests/workgroup/run_kernel.c +++ b/tests/workgroup/run_kernel.c @@ -152,6 +152,9 @@ main (int argc, char **argv) if (context) clReleaseContext (context); + free (source); + free (devices); + if (err == CL_SUCCESS) { printf ("OK\n");