diff --git a/CMakeLists.txt b/CMakeLists.txt
index c3733f1280..edbc68ee39 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1325,7 +1325,8 @@ cl_exp_pinned_buffers")
 set(HOST_DEVICE_FEATURES_30 "__opencl_c_3d_image_writes  __opencl_c_images \
 __opencl_c_atomic_order_acq_rel __opencl_c_atomic_order_seq_cst \
 __opencl_c_atomic_scope_device __opencl_c_program_scope_global_variables \
-__opencl_c_atomic_scope_all_devices __opencl_c_generic_address_space")
+__opencl_c_atomic_scope_all_devices __opencl_c_generic_address_space \
+__opencl_c_work_group_collective_functions")
 
 # Host CPU device: extensions only enabled when conformance is OFF
 if(NOT ENABLE_CONFORMANCE)
diff --git a/doc/sphinx/source/notes_6_0.rst b/doc/sphinx/source/notes_6_0.rst
index ec0a37b8a3..27d21357ea 100644
--- a/doc/sphinx/source/notes_6_0.rst
+++ b/doc/sphinx/source/notes_6_0.rst
@@ -2,14 +2,6 @@
 Release Notes for PoCL 6.0
 **************************
 
-
-
-Minimal support for `cl_khr_priority_hints` and `cl_khr_throttle_hints` has been added.
-As the extension specification states that these hints provide no guarantees of
-any particular behavior (or lack thereof) they are treated as a no-op. However
-specifying them no longer causes `clCreateCommandQueueWithProperties` to return
-an error.
-
 ============================
 New device driver: cpu-tbb
 ============================
@@ -18,6 +10,16 @@ The cpu-tbb device driver uses the Intel oneAPI Threading Building Blocks (oneTB
 library for work-group and kernel-level task scheduling. Except for the
 task scheduler, the driver is identical to the original 'cpu' driver (pthread).
 
+=====================================
+Command queue priority/throttle hints
+=====================================
+
+Minimal support for `cl_khr_priority_hints` and `cl_khr_throttle_hints` has been added.
+As the extension specification states that these hints provide no guarantees of
+any particular behavior (or lack thereof) they are treated as a no-op. However
+specifying them no longer causes `clCreateCommandQueueWithProperties` to return
+an error.
+
 ===========================
 Driver-specific features
 ===========================
@@ -26,9 +28,13 @@ Driver-specific features
 CPU driver
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-The 'cpu' driver gained support for using OpenMP for thread scheduling.
-Support is disabled by default, but can be enabled with CMake option. The
-'cpu-minimal' driver does not support OpenMP.
+ * Support for using OpenMP for task scheduling was added. It is disabled
+   by default, but can be enabled with a CMake option. The 'cpu-minimal'
+   driver does not support OpenMP since it's supposed to be single-threaded.
+ * The CPU drivers can now be used for running SYCL programs compiled with
+   the oneAPI binary distributions of DPC++ by adding the following environment
+   settings: **POCL_DRIVER_VERSION_OVERRIDE=2023.16.7.0.21_160000 POCL_CPU_VENDOR_ID_OVERRIDE=32902**.
+ * Added support for the **__opencl_c_work_group_collective_functions** feature.
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Remote
diff --git a/doc/sphinx/source/using.rst b/doc/sphinx/source/using.rst
index e54023e6b4..7e7ec6d7f8 100644
--- a/doc/sphinx/source/using.rst
+++ b/doc/sphinx/source/using.rst
@@ -160,6 +160,14 @@ pocl.
  'cpu' device driver. The default is to determine this from the number of
  hardware threads available in the CPU.
 
+- **POCL_CPU_VENDOR_ID_OVERRIDE**
+
+ Overrides the vendor id reported by PoCL for the CPU drivers.
+ For example, setting the vendor id to be 32902 (0x8086) and setting the driver
+ version using **POCL_DRIVER_VERSION_OVERRIDE** to "2023.16.7.0.21_160000" (or such) can
+ be used to convince binary-distributed DPC++ compilers to compile and run SYCL
+ programs on the PoCL-CPU driver.
+
 - **POCL_DEBUG**
 
  Enables debug messages to stderr. This will be mostly messages from error
@@ -170,7 +178,8 @@ pocl.
 
  The old way (setting POCL_DEBUG to 1) has been updated to support categories.
  Using this limits the amount of debug messages produced. Current options are:
- error,warning,general,memory,llvm,events,cache,locking,refcounts,timing,hsa,tce,cuda,vulkan,proxy,all.
+ 'error', 'warning', 'general', 'memory', 'llvm', 'events', 'cache', 'locking',
+ 'refcounts', 'timing', 'hsa', 'tce', 'cuda', 'vulkan', 'proxy' and 'all'.
  Note: setting POCL_DEBUG to 1 still works and equals error+warning+general.
 
 - **POCL_DEBUG_LLVM_PASSES**
@@ -221,6 +230,11 @@ pocl.
  POCL_TTASIM0_PARAMETERS will be passed to the first ttasim driver instantiated
  and POCL_TTASIM1_PARAMETERS to the second one.
 
+- **POCL_DRIVER_VERSION_OVERRIDE**
+
+  Can be used to override the driver version reported by PoCL.
+  See **POCL_CPU_VENDOR_ID_OVERRIDE** for an example use case.
+
 - **POCL_EXTRA_BUILD_FLAGS**
 
  Adds the contents of the environment variable to all clBuildProgram() calls.
diff --git a/examples/boxadd/boxadd.c b/examples/boxadd/boxadd.c
index a89074d611..38ca19924d 100644
--- a/examples/boxadd/boxadd.c
+++ b/examples/boxadd/boxadd.c
@@ -112,5 +112,9 @@ main (int argc, char **argv)
   CHECK_CL_ERROR (clReleaseContext (context));
   CHECK_CL_ERROR (clUnloadPlatformCompiler (platform));
 
+  free (srcA);
+  free (srcB);
+  free (dst);
+
   return err;
 }
diff --git a/examples/matadd/matadd.c b/examples/matadd/matadd.c
index 2c54d597d1..09690ef5be 100644
--- a/examples/matadd/matadd.c
+++ b/examples/matadd/matadd.c
@@ -108,5 +108,9 @@ main (int argc, char **argv)
   CHECK_CL_ERROR (clReleaseContext (context));
   CHECK_CL_ERROR (clUnloadPlatformCompiler (platform));
 
+  free (srcA);
+  free (srcB);
+  free (dst);
+
   return err;
 }
diff --git a/include/pocl.h b/include/pocl.h
index 1714b02fd0..ea2e2e5276 100644
--- a/include/pocl.h
+++ b/include/pocl.h
@@ -47,9 +47,11 @@
 /* detects restrict, variadic macros etc */
 #include "pocl_compiler_features.h"
 
-/* The maximum file, directory and path name lengths. TODO: These should be
-   detected from the filesystem properties of the execution platform. */
-#define POCL_MAX_DIRNAME_LENGTH 255
+/* The maximum file, directory and path name lengths.
+   NOTE: GDB seems to fail to load symbols from .so files which have
+   longer pathnames than 511, thus the quite small dir/filename length
+   limiter. */
+#define POCL_MAX_DIRNAME_LENGTH 64
 #define POCL_MAX_FILENAME_LENGTH (POCL_MAX_DIRNAME_LENGTH)
 #define POCL_MAX_PATHNAME_LENGTH 4096
 
diff --git a/lib/CL/clCreateProgramWithIL.c b/lib/CL/clCreateProgramWithIL.c
index d6a7265026..90e3647834 100644
--- a/lib/CL/clCreateProgramWithIL.c
+++ b/lib/CL/clCreateProgramWithIL.c
@@ -128,16 +128,13 @@ CL_API_SUFFIX__VERSION_2_1
   POCL_GOTO_ERROR_COND ((length == 0), CL_INVALID_VALUE);
 
   int is_spirv = 0;
-#ifdef ENABLE_SPIRV
   int is_spirv_kernel
       = pocl_bitcode_is_spirv_execmodel_kernel ((const char *)il, length);
   is_spirv += is_spirv_kernel;
-#endif
-#ifdef ENABLE_VULKAN
+
   int is_spirv_shader
       = pocl_bitcode_is_spirv_execmodel_shader ((const char *)il, length);
   is_spirv += is_spirv_shader;
-#endif
 
   POCL_GOTO_ERROR_ON (
       (!is_spirv), CL_INVALID_VALUE,
diff --git a/lib/CL/clGetDeviceInfo.c b/lib/CL/clGetDeviceInfo.c
index 41e456b052..c3e49aa59c 100644
--- a/lib/CL/clGetDeviceInfo.c
+++ b/lib/CL/clGetDeviceInfo.c
@@ -61,14 +61,14 @@ POname(clGetDeviceInfo)(cl_device_id   device,
   case CL_DEVICE_IMAGE_SUPPORT:
     POCL_RETURN_GETINFO(cl_bool, device->image_support);
   case CL_DEVICE_TYPE:
-    POCL_RETURN_GETINFO(cl_device_type, device->type);   
+    POCL_RETURN_GETINFO (cl_device_type, device->type);
   case CL_DEVICE_VENDOR_ID:
     POCL_RETURN_GETINFO(cl_uint, device->vendor_id);
   case CL_DEVICE_MAX_COMPUTE_UNITS:
     POCL_RETURN_GETINFO(cl_uint, device->max_compute_units);
   case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS          :
     POCL_RETURN_GETINFO(cl_uint, device->max_work_item_dimensions);
-  case CL_DEVICE_MAX_WORK_GROUP_SIZE               : 
+  case CL_DEVICE_MAX_WORK_GROUP_SIZE:
     {
       size_t max_wg_size = device->max_work_group_size;
       POCL_RETURN_GETINFO(size_t, max_wg_size);
@@ -342,7 +342,7 @@ POname(clGetDeviceInfo)(cl_device_id   device,
   case CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT:
     POCL_RETURN_GETINFO (cl_bool, device->non_uniform_work_group_support);
   case CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT:
-    POCL_RETURN_GETINFO (cl_bool, CL_FALSE);
+    POCL_RETURN_GETINFO (cl_bool, device->wg_collective_func_support);
   case CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT:
     POCL_RETURN_GETINFO (cl_bool, device->generic_as_support);
   case CL_DEVICE_DEVICE_ENQUEUE_CAPABILITIES:
diff --git a/lib/CL/devices/common.c b/lib/CL/devices/common.c
index 40298250bd..66f54fed0d 100644
--- a/lib/CL/devices/common.c
+++ b/lib/CL/devices/common.c
@@ -1984,7 +1984,7 @@ pocl_setup_ils_with_version (cl_device_id dev)
     }
 }
 
-static const cl_name_version OPENCL_FEATURES[] = {
+static const cl_name_version OPENCL_C_FEATURES[] = {
   { CL_MAKE_VERSION (3, 0, 0), "__opencl_c_3d_image_writes" },
   { CL_MAKE_VERSION (3, 0, 0), "__opencl_c_images" },
   { CL_MAKE_VERSION (3, 0, 0), "__opencl_c_read_write_images" },
@@ -2013,15 +2013,15 @@ static const cl_name_version OPENCL_FEATURES[] = {
   { CL_MAKE_VERSION (3, 0, 0), "__opencl_c_ext_fp64_local_atomic_min_max" },
 };
 
-const size_t OPENCL_FEATURES_NUM
-    = sizeof (OPENCL_FEATURES) / sizeof (OPENCL_FEATURES[0]);
+const size_t OPENCL_C_FEATURES_NUM
+    = sizeof (OPENCL_C_FEATURES) / sizeof (OPENCL_C_FEATURES[0]);
 
 void
 pocl_setup_features_with_version (cl_device_id dev)
 {
   cl_name_version *tmp = NULL;
   unsigned ret = pocl_space_delim_string_to_cl_name_version_array (
-      &tmp, dev->features, OPENCL_FEATURES, OPENCL_FEATURES_NUM);
+      &tmp, dev->features, OPENCL_C_FEATURES, OPENCL_C_FEATURES_NUM);
 
   dev->num_opencl_features_with_version = ret;
   dev->opencl_features_with_version = tmp;
diff --git a/lib/CL/devices/common_utils.c b/lib/CL/devices/common_utils.c
index 32551537b7..5e17acc5dd 100644
--- a/lib/CL/devices/common_utils.c
+++ b/lib/CL/devices/common_utils.c
@@ -105,9 +105,8 @@ align_ptr (char *p)
 
 #define FALLBACK_MAX_THREAD_COUNT 8
 
-/* initializes CPU-specific device info struct members, that cannot / should
+/* Initializes CPU-specific device info defaults that cannot / should
    not be initialized in pocl_init_default_device_infos() */
-
 cl_int
 pocl_cpu_init_common (cl_device_id device)
 {
@@ -139,6 +138,7 @@ pocl_cpu_init_common (cl_device_id device)
   device->features = HOST_DEVICE_FEATURES_30;
   device->run_program_scope_variables_pass = CL_TRUE;
   device->generic_as_support = CL_TRUE;
+  device->wg_collective_func_support = CL_TRUE;
 
   pocl_setup_opencl_c_with_version (device, CL_TRUE);
   pocl_setup_features_with_version (device);
diff --git a/lib/CL/devices/cpuinfo.c b/lib/CL/devices/cpuinfo.c
index 4919915b34..ca6026f5f9 100644
--- a/lib/CL/devices/cpuinfo.c
+++ b/lib/CL/devices/cpuinfo.c
@@ -305,7 +305,8 @@ pocl_cpuinfo_get_cpu_name_and_vendor(cl_device_id device)
   /* default vendor and vendor_id, in case it cannot be found by other means */
   device->vendor = cpuvendor_default;
   if (device->vendor_id == 0)
-    device->vendor_id = CL_KHRONOS_VENDOR_ID_POCL;
+    device->vendor_id = pocl_get_int_option ("POCL_CPU_VENDOR_ID_OVERRIDE",
+                                             CL_KHRONOS_VENDOR_ID_POCL);
 
   /* read contents of /proc/cpuinfo */
   if (access (cpuinfo, R_OK) != 0)
@@ -313,7 +314,7 @@ pocl_cpuinfo_get_cpu_name_and_vendor(cl_device_id device)
 
   FILE *f = fopen (cpuinfo, "r");
   char contents[MAX_CPUINFO_SIZE];
-  int num_read = fread (contents, 1, MAX_CPUINFO_SIZE - 1, f);            
+  int num_read = fread (contents, 1, MAX_CPUINFO_SIZE - 1, f);
   fclose(f);
   contents[num_read]='\0';
 
diff --git a/lib/CL/devices/devices.c b/lib/CL/devices/devices.c
index ce227e8f14..ca7996f7f9 100644
--- a/lib/CL/devices/devices.c
+++ b/lib/CL/devices/devices.c
@@ -669,7 +669,9 @@ pocl_init_devices ()
              a shared global memory. */
           dev->global_mem_id = dev_index;
           POCL_INIT_OBJECT (dev);
-          dev->driver_version = POCL_VERSION_FULL;
+          dev->driver_version = pocl_get_string_option (
+              "POCL_DRIVER_VERSION_OVERRIDE", POCL_VERSION_FULL);
+
           if (dev->version == NULL)
             dev->version = "OpenCL 2.0 pocl";
 
diff --git a/lib/CL/devices/remote/remote.c b/lib/CL/devices/remote/remote.c
index 6d89cc3364..a589eb7146 100644
--- a/lib/CL/devices/remote/remote.c
+++ b/lib/CL/devices/remote/remote.c
@@ -686,7 +686,7 @@ setup_relevant_devices (cl_program program, cl_device_id device,
   remote_server_data_t *server
       = ((remote_device_data_t *)device->data)->server;
   unsigned num_relevant_devices = 0;
-  char program_bc_path[POCL_MAX_FILENAME_LENGTH];
+  char program_bc_path[POCL_MAX_PATHNAME_LENGTH];
   unsigned i, j;
 
   for (i = 0; i < program->num_devices; ++i)
diff --git a/lib/CL/pocl_cache.c b/lib/CL/pocl_cache.c
index ecf5070fd1..a620694a30 100644
--- a/lib/CL/pocl_cache.c
+++ b/lib/CL/pocl_cache.c
@@ -92,12 +92,11 @@ void pocl_cache_program_path(char*        path,
   program_device_dir (path, program, device_i, "");
 }
 
-// required in llvm API
 void pocl_cache_program_bc_path(char*        program_bc_path,
                                 cl_program   program,
                                 unsigned     device_i) {
-    program_device_dir(program_bc_path, program,
-                       device_i, POCL_PROGRAM_BC_FILENAME);
+  program_device_dir (program_bc_path, program,
+                      device_i, POCL_PROGRAM_BC_FILENAME);
 }
 
 void
@@ -208,9 +207,9 @@ pocl_cache_kernel_cachedir (char *kernel_cachedir_path, cl_program program,
 {
   int bytes_written;
   char tempstring[POCL_MAX_PATHNAME_LENGTH];
-  char file_name[POCL_MAX_DIRNAME_LENGTH + 1];
+  char file_name[POCL_MAX_FILENAME_LENGTH + 1];
 
-  pocl_hash_clipped_name (kernel_name, POCL_MAX_DIRNAME_LENGTH, &file_name[0]);
+  pocl_hash_clipped_name (kernel_name, POCL_MAX_FILENAME_LENGTH, &file_name[0]);
 
   bytes_written
       = snprintf (tempstring, POCL_MAX_PATHNAME_LENGTH, "/%s", file_name);
diff --git a/lib/CL/pocl_cl.h b/lib/CL/pocl_cl.h
index 8dd114868d..f0881f4501 100644
--- a/lib/CL/pocl_cl.h
+++ b/lib/CL/pocl_cl.h
@@ -853,6 +853,7 @@ struct _cl_device_id {
   size_t preferred_wg_size_multiple;
   cl_bool non_uniform_work_group_support;
   cl_bool generic_as_support;
+  cl_bool wg_collective_func_support;
   cl_uint preferred_vector_width_char;
   cl_uint preferred_vector_width_short;
   cl_uint preferred_vector_width_int;
diff --git a/lib/kernel/host/CMakeLists.txt b/lib/kernel/host/CMakeLists.txt
index 77d34b479b..27b3d3959b 100644
--- a/lib/kernel/host/CMakeLists.txt
+++ b/lib/kernel/host/CMakeLists.txt
@@ -139,6 +139,7 @@ vload_store_half_f16c.c
 vstore.cl
 vstore_half.cl
 wait_group_events.cl
+work_group.c
 write_image.cl
 
 ###################################################################
diff --git a/lib/kernel/subgroups.c b/lib/kernel/subgroups.c
index be48fc59e3..a836f92ab9 100644
--- a/lib/kernel/subgroups.c
+++ b/lib/kernel/subgroups.c
@@ -29,37 +29,11 @@
 
 #include <math.h>
 
-/**
- * \brief Internal pseudo function which allocates space from the work-group
- * thread's stack (basically local memory) for each work-item.
- *
- * It's expanded in WorkitemLoops.cc to an alloca().
- *
- * @param element_size The size of an element to allocate (for all WIs in the
- * WG).
- * @param align The alignment of the start of chunk.
- * @param extra_bytes extra bytes to add to the allocation, some functions need
- * extra space
- * @return pointer to the allocated stack space (freed at unwind).
- */
-void *__pocl_work_group_alloca (size_t element_size, size_t align,
-                                size_t extra_bytes);
-
-/**
- * \brief Internal pseudo function which allocates space from the work-group
- * thread's stack (basically local memory).
- *
- * It's expanded in WorkitemLoops.cc to an alloca().
- *
- * @param bytes The size of data to allocate in bytes.
- * @param align The alignment of the start of chunk.
- * @return pointer to the allocated stack space (freed at unwind).
- */
-void *__pocl_local_mem_alloca (size_t bytes, size_t align);
-
-size_t _CL_OVERLOADABLE get_local_size (unsigned int dimindx);
+#include "work_group_alloca.h"
 
 size_t _CL_OVERLOADABLE get_local_id (unsigned int dimindx);
+size_t _CL_OVERLOADABLE get_local_linear_id (void);
+size_t _CL_OVERLOADABLE get_local_size (unsigned int dimindx);
 
 /* Magic variable that is expanded in Workgroup.cc */
 extern uint _pocl_sub_group_size;
@@ -89,8 +63,6 @@ get_enqueued_num_sub_groups (void)
   return 1;
 }
 
-size_t _CL_OVERLOADABLE get_local_linear_id (void);
-
 uint _CL_OVERLOADABLE
 get_sub_group_id (void)
 {
diff --git a/lib/kernel/work_group.c b/lib/kernel/work_group.c
new file mode 100644
index 0000000000..c68f013360
--- /dev/null
+++ b/lib/kernel/work_group.c
@@ -0,0 +1,228 @@
+/* OpenCL built-in library: work-group collective functions
+
+   Copyright (c) 2024 Pekka Jääskeläinen / Intel Finland Oy
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to
+   deal in the Software without restriction, including without limitation the
+   rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+   sell copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+   FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+   IN THE SOFTWARE.
+*/
+
+#include "work_group_alloca.h"
+#include <math.h>
+#include <stdio.h>
+
+size_t _CL_OVERLOADABLE get_local_id (unsigned int dimindx);
+size_t _CL_OVERLOADABLE get_local_linear_id (void);
+size_t _CL_OVERLOADABLE get_local_size (unsigned int dimindx);
+void _CL_OVERLOADABLE
+    POCL_BUILTIN_PREFIX (work_group_barrier) (cl_mem_fence_flags flags);
+
+#define work_group_barrier POCL_BUILTIN_PREFIX (work_group_barrier)
+
+/* Align the stack temporary data by this multiple to facilitate easier
+   vectorization. */
+#define ALIGN_ELEMENT_MULTIPLE 32
+
+static size_t
+get_total_local_size ()
+{
+  return get_local_size (0) * get_local_size (1) * get_local_size (2);
+}
+
+#define WORK_GROUP_SHUFFLE_PT(PREFIX, TYPE)                                   \
+  __attribute__ ((always_inline)) static TYPE _CL_OVERLOADABLE                \
+      PREFIX##work_group_shuffle (TYPE val, size_t id)                        \
+  {                                                                           \
+    volatile TYPE *temp_storage = __pocl_work_group_alloca (                  \
+        sizeof (TYPE), ALIGN_ELEMENT_MULTIPLE * sizeof (TYPE), 0);            \
+    temp_storage[get_local_linear_id ()] = val;                               \
+    work_group_barrier (CLK_LOCAL_MEM_FENCE);                                 \
+    return temp_storage[id % get_total_local_size ()];                        \
+  }
+
+/* Only the non-prefixed shuffle is defined; PREFIX allows vendor variants. */
+#define WORK_GROUP_SHUFFLE_T(TYPE) WORK_GROUP_SHUFFLE_PT (, TYPE)
+
+WORK_GROUP_SHUFFLE_T (char)
+WORK_GROUP_SHUFFLE_T (uchar)
+WORK_GROUP_SHUFFLE_T (short)
+WORK_GROUP_SHUFFLE_T (ushort)
+WORK_GROUP_SHUFFLE_T (int)
+WORK_GROUP_SHUFFLE_T (uint)
+WORK_GROUP_SHUFFLE_T (long)
+WORK_GROUP_SHUFFLE_T (ulong)
+WORK_GROUP_SHUFFLE_T (float)
+WORK_GROUP_SHUFFLE_T (double)
+
+#define WORK_GROUP_BROADCAST_T(TYPE)                                          \
+  __attribute__ ((always_inline)) TYPE _CL_OVERLOADABLE                       \
+  work_group_broadcast (TYPE val, size_t x)                                   \
+  {                                                                           \
+    return work_group_shuffle (val, x);                                       \
+  }                                                                           \
+  __attribute__ ((always_inline)) TYPE _CL_OVERLOADABLE                       \
+  work_group_broadcast (TYPE val, size_t x, size_t y)                         \
+  {                                                                           \
+    return work_group_shuffle (val, y * get_local_size (0) + x);              \
+  }                                                                           \
+  __attribute__ ((always_inline)) TYPE _CL_OVERLOADABLE                       \
+  work_group_broadcast (TYPE val, size_t x, size_t y, size_t z)               \
+  {                                                                           \
+    return work_group_shuffle (val,                                           \
+                               z * get_local_size (1) * get_local_size (0)    \
+                                   + y * get_local_size (0) + x);             \
+  }
+
+WORK_GROUP_BROADCAST_T (int)
+WORK_GROUP_BROADCAST_T (uint)
+WORK_GROUP_BROADCAST_T (long)
+WORK_GROUP_BROADCAST_T (ulong)
+WORK_GROUP_BROADCAST_T (float)
+WORK_GROUP_BROADCAST_T (double)
+
+#define WORK_GROUP_REDUCE_OT(OPNAME, OPERATION, TYPE)                         \
+  __attribute__ ((always_inline))                                             \
+  TYPE _CL_OVERLOADABLE work_group_reduce_##OPNAME (TYPE val)                 \
+  {                                                                           \
+    volatile TYPE *temp_storage = __pocl_work_group_alloca (                  \
+        sizeof (TYPE), ALIGN_ELEMENT_MULTIPLE * sizeof (TYPE), 0);            \
+    temp_storage[get_local_linear_id ()] = val;                               \
+    work_group_barrier (CLK_LOCAL_MEM_FENCE);                                 \
+    if (get_local_linear_id () == 0)                                          \
+      {                                                                       \
+        for (uint i = 1; i < get_total_local_size (); ++i)                    \
+          {                                                                   \
+            TYPE a = temp_storage[0], b = temp_storage[i];                    \
+            temp_storage[0] = OPERATION;                                      \
+          }                                                                   \
+      }                                                                       \
+    work_group_barrier (CLK_LOCAL_MEM_FENCE);                                 \
+    return temp_storage[0];                                                   \
+  }
+
+#define WORK_GROUP_REDUCE_T(OPNAME, OPERATION)                                \
+  WORK_GROUP_REDUCE_OT (OPNAME, OPERATION, int)                               \
+  WORK_GROUP_REDUCE_OT (OPNAME, OPERATION, uint)                              \
+  WORK_GROUP_REDUCE_OT (OPNAME, OPERATION, long)                              \
+  WORK_GROUP_REDUCE_OT (OPNAME, OPERATION, ulong)                             \
+  WORK_GROUP_REDUCE_OT (OPNAME, OPERATION, float)                             \
+  WORK_GROUP_REDUCE_OT (OPNAME, OPERATION, double)
+
+WORK_GROUP_REDUCE_T (add, a + b)
+WORK_GROUP_REDUCE_T (min, a > b ? b : a)
+WORK_GROUP_REDUCE_T (max, a > b ? a : b)
+
+#define WORK_GROUP_SCAN_INCLUSIVE_OT(OPNAME, OPERATION, TYPE)                 \
+  __attribute__ ((always_inline))                                             \
+  TYPE _CL_OVERLOADABLE work_group_scan_inclusive_##OPNAME (TYPE val)         \
+  {                                                                           \
+    volatile TYPE *data = __pocl_work_group_alloca (                          \
+        sizeof (TYPE), ALIGN_ELEMENT_MULTIPLE * sizeof (TYPE), 0);            \
+    data[get_local_linear_id ()] = val;                                       \
+    work_group_barrier (CLK_LOCAL_MEM_FENCE);                                 \
+    if (get_local_linear_id () == 0)                                          \
+      {                                                                       \
+        for (uint i = 1; i < get_total_local_size (); ++i)                    \
+          {                                                                   \
+            TYPE a = data[i - 1], b = data[i];                                \
+            data[i] = OPERATION;                                              \
+          }                                                                   \
+      }                                                                       \
+    work_group_barrier (CLK_LOCAL_MEM_FENCE);                                 \
+    return data[get_local_linear_id ()];                                      \
+  }
+
+#define WORK_GROUP_SCAN_INCLUSIVE_T(OPNAME, OPERATION)                        \
+  WORK_GROUP_SCAN_INCLUSIVE_OT (OPNAME, OPERATION, int)                       \
+  WORK_GROUP_SCAN_INCLUSIVE_OT (OPNAME, OPERATION, uint)                      \
+  WORK_GROUP_SCAN_INCLUSIVE_OT (OPNAME, OPERATION, long)                      \
+  WORK_GROUP_SCAN_INCLUSIVE_OT (OPNAME, OPERATION, ulong)                     \
+  WORK_GROUP_SCAN_INCLUSIVE_OT (OPNAME, OPERATION, float)                     \
+  WORK_GROUP_SCAN_INCLUSIVE_OT (OPNAME, OPERATION, double)
+
+WORK_GROUP_SCAN_INCLUSIVE_T (add, a + b)
+WORK_GROUP_SCAN_INCLUSIVE_T (min, a > b ? b : a)
+WORK_GROUP_SCAN_INCLUSIVE_T (max, a > b ? a : b)
+
+#define WORK_GROUP_SCAN_EXCLUSIVE_OT(OPNAME, OPERATION, TYPE, ID)             \
+  __attribute__ ((always_inline))                                             \
+  TYPE _CL_OVERLOADABLE work_group_scan_exclusive_##OPNAME (TYPE val)         \
+  {                                                                           \
+    volatile TYPE *data = __pocl_work_group_alloca (                          \
+        sizeof (TYPE), ALIGN_ELEMENT_MULTIPLE * sizeof (TYPE),                \
+        sizeof (TYPE));                                                       \
+    data[get_local_linear_id () + 1] = val;                                   \
+    data[0] = ID;                                                             \
+    work_group_barrier (CLK_LOCAL_MEM_FENCE);                                 \
+    if (get_local_linear_id () == 0)                                          \
+      {                                                                       \
+        for (uint i = 1; i < get_total_local_size (); ++i)                    \
+          {                                                                   \
+            TYPE a = data[i - 1], b = data[i];                                \
+            data[i] = OPERATION;                                              \
+          }                                                                   \
+      }                                                                       \
+    work_group_barrier (CLK_LOCAL_MEM_FENCE);                                 \
+    return data[get_local_linear_id ()];                                      \
+  }
+
+WORK_GROUP_SCAN_EXCLUSIVE_OT (add, a + b, int, 0)
+WORK_GROUP_SCAN_EXCLUSIVE_OT (add, a + b, uint, 0)
+WORK_GROUP_SCAN_EXCLUSIVE_OT (add, a + b, long, 0)
+WORK_GROUP_SCAN_EXCLUSIVE_OT (add, a + b, ulong, 0)
+WORK_GROUP_SCAN_EXCLUSIVE_OT (add, a + b, float, 0.0f)
+WORK_GROUP_SCAN_EXCLUSIVE_OT (add, a + b, double, 0.0)
+
+WORK_GROUP_SCAN_EXCLUSIVE_OT (min, a > b ? b : a, int, INT_MAX)
+WORK_GROUP_SCAN_EXCLUSIVE_OT (min, a > b ? b : a, uint, UINT_MAX)
+WORK_GROUP_SCAN_EXCLUSIVE_OT (min, a > b ? b : a, long, LONG_MAX)
+WORK_GROUP_SCAN_EXCLUSIVE_OT (min, a > b ? b : a, ulong, ULONG_MAX)
+WORK_GROUP_SCAN_EXCLUSIVE_OT (min, a > b ? b : a, float, +INFINITY)
+WORK_GROUP_SCAN_EXCLUSIVE_OT (min, a > b ? b : a, double, +INFINITY)
+
+WORK_GROUP_SCAN_EXCLUSIVE_OT (max, a > b ? a : b, int, INT_MIN)
+WORK_GROUP_SCAN_EXCLUSIVE_OT (max, a > b ? a : b, uint, 0)
+WORK_GROUP_SCAN_EXCLUSIVE_OT (max, a > b ? a : b, long, LONG_MIN)
+WORK_GROUP_SCAN_EXCLUSIVE_OT (max, a > b ? a : b, ulong, 0)
+WORK_GROUP_SCAN_EXCLUSIVE_OT (max, a > b ? a : b, float, -INFINITY)
+WORK_GROUP_SCAN_EXCLUSIVE_OT (max, a > b ? a : b, double, -INFINITY)
+
+__attribute__ ((always_inline)) int _CL_OVERLOADABLE
+work_group_any (int predicate)
+{
+  /* Per-work-item flags: each WI stores its normalized (0/1) predicate. */
+  int *flags = __pocl_work_group_alloca (
+      sizeof (int), ALIGN_ELEMENT_MULTIPLE * sizeof (int), 0);
+  flags[get_local_linear_id ()] = !!predicate;
+  /* The final result, computed by the WI with local linear id 0. */
+  int *result = __pocl_work_group_alloca (sizeof (int), sizeof (int), 0);
+  work_group_barrier (CLK_LOCAL_MEM_FENCE);
+  if (get_local_linear_id () == 0)
+    {
+      *result = 0;
+      for (uint i = 0; i < get_total_local_size (); ++i)
+        *result |= flags[i];
+    }
+  work_group_barrier (CLK_LOCAL_MEM_FENCE);
+  return *result;
+}
+
+__attribute__ ((always_inline)) int _CL_OVERLOADABLE
+work_group_all (int predicate)
+{
+  return !work_group_any (!predicate); /* De Morgan: all(p) == !any(!p) */
+}
diff --git a/lib/kernel/work_group_alloca.h b/lib/kernel/work_group_alloca.h
new file mode 100644
index 0000000000..1cff7e5cb8
--- /dev/null
+++ b/lib/kernel/work_group_alloca.h
@@ -0,0 +1,50 @@
+/* OpenCL built-in library: internal work group memory allocation functionality
+
+   Copyright (c) 2022-2023 Pekka Jääskeläinen / Intel Finland Oy
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to
+   deal in the Software without restriction, including without limitation the
+   rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+   sell copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+   FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+   IN THE SOFTWARE.
+*/
+
+/**
+ * \brief Internal pseudo function which allocates space from the work-group
+ * thread's stack (basically local memory) for each work-item.
+ *
+ * It's expanded in WorkitemLoops.cc to an alloca().
+ *
+ * @param element_size The size of an element to allocate (for all WIs in the
+ * WG).
+ * @param align The alignment of the start of chunk.
+ * @param extra_bytes extra bytes to add to the allocation, some functions need
+ * extra space
+ * @return pointer to the allocated stack space (freed at unwind).
+ */
+void *__pocl_work_group_alloca (size_t element_size, size_t align,
+                                size_t extra_bytes);
+
+/**
+ * \brief Internal pseudo function which allocates space from the work-group
+ * thread's stack (basically local memory).
+ *
+ * It's expanded in WorkitemLoops.cc to an alloca().
+ *
+ * @param bytes The size of data to allocate in bytes.
+ * @param align The alignment of the start of chunk.
+ * @return pointer to the allocated stack space (freed at unwind).
+ */
+void *__pocl_local_mem_alloca (size_t bytes, size_t align);
diff --git a/pocld/shared_cl_context.cc b/pocld/shared_cl_context.cc
index 2e459fdf79..f5f7994cce 100644
--- a/pocld/shared_cl_context.cc
+++ b/pocld/shared_cl_context.cc
@@ -1260,9 +1260,12 @@ bool createSPIRVWithSVMOffset(const std::vector<unsigned char> *InputSPV,
     LibPoCLPath /=
         std::filesystem::path(BUILDDIR) / "lib" / "CL" / "libpocl.so";
 
+  // Without -strip-debug there might be crashes due to llvm-spirv
+  // not detecting its own produced debug output sometimes (to
+  // report).
   OptCmd << LLVM_OPT << " -load-pass-plugin=" << LibPoCLPath
-         << " -passes=svm-offset -svm-offset-value=" << SVMOffset << " "
-         << OrigBcFileName << " -o " << OffsettedBcFileName;
+         << " -strip-debug -passes=svm-offset -svm-offset-value=" << SVMOffset
+         << " " << OrigBcFileName << " -o " << OffsettedBcFileName;
 
   if (system(OptCmd.str().c_str()) != EXIT_SUCCESS)
     return false;
diff --git a/tests/regression/test_llvm_segfault_issue_889.c b/tests/regression/test_llvm_segfault_issue_889.c
index 7ed2c1b067..4d39cad962 100644
--- a/tests/regression/test_llvm_segfault_issue_889.c
+++ b/tests/regression/test_llvm_segfault_issue_889.c
@@ -114,7 +114,7 @@ main ()
     printf ("binary size [%zd]: %zd\n", i, binsizes[i]);
 
   CHECK_CL_ERROR (clReleaseProgram (program));
-
+  CHECK_CL_ERROR (clReleaseCommandQueue (command_queue));
   CHECK_CL_ERROR (clReleaseContext (context));
 
   printf ("OK\n");
diff --git a/tests/workgroup/run_kernel.c b/tests/workgroup/run_kernel.c
index 7a7da4a581..27f6bf6389 100644
--- a/tests/workgroup/run_kernel.c
+++ b/tests/workgroup/run_kernel.c
@@ -152,6 +152,9 @@ main (int argc, char **argv)
   if (context)
     clReleaseContext (context);
 
+  free (source);
+  free (devices);
+
   if (err == CL_SUCCESS)
     {
       printf ("OK\n");
diff --git a/tools/scripts/format-diff.sh b/tools/scripts/format-diff.sh
index 4fec5bd29f..f9e079559e 100755
--- a/tools/scripts/format-diff.sh
+++ b/tools/scripts/format-diff.sh
@@ -15,7 +15,7 @@ pushd ${GITROOT} > /dev/null
 PATCHY=$(mktemp /tmp/pocl.XXXXXXXX.patch)
 trap "rm -f $PATCHY" EXIT
 
-git diff -U0 --no-color >$PATCHY
+git diff "$@" -U0 --no-color >$PATCHY  # "$@" forwards each argument verbatim
 
 $RELPATH/clang-format-diff.py -regex '.*(\.h$|\.c$|\.cl$)' -i -p1 -style GNU <$PATCHY
 $RELPATH/clang-format-diff.py -regex '(.*(\.hpp$|\.hh$|\.cc$|\.cpp$))|(lib/llvmopencl/.*)|(lib/CL/devices/tce/.*)' -i -p1 -style LLVM <$PATCHY