Skip to content

Commit

Permalink
Merge pull request #1429 from franz/intel_sg_shuffle_block
Browse files Browse the repository at this point in the history
implement additional bits of cl_intel_subgroup_shuffle
  • Loading branch information
pjaaskel committed Feb 23, 2024
2 parents 2a7d9af + 34e22b9 commit 9d0aed7
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 18 deletions.
10 changes: 5 additions & 5 deletions CMakeLists.txt
Expand Up @@ -1305,15 +1305,15 @@ cl_exp_pinned_buffers")
set(HOST_DEVICE_FEATURES_30 "__opencl_c_3d_image_writes __opencl_c_images \
__opencl_c_atomic_order_acq_rel __opencl_c_atomic_order_seq_cst \
__opencl_c_atomic_scope_device __opencl_c_program_scope_global_variables \
__opencl_c_atomic_scope_all_devices __opencl_c_generic_address_space \
__opencl_c_read_write_images")
__opencl_c_atomic_scope_all_devices __opencl_c_generic_address_space")

# Host CPU device: extensions only enabled when conformance is OFF
if(NOT ENABLE_CONFORMANCE)
set(HOST_DEVICE_EXTENSIONS "${HOST_DEVICE_EXTENSIONS} cl_khr_subgroup_ballot \
cl_khr_subgroup_shuffle cl_intel_subgroups cl_intel_required_subgroup_size \
cl_ext_float_atomics")
# set( HOST_DEVICE_FEATURES_30 "${HOST_DEVICE_FEATURES_30} __opencl_c_read_write_images")
cl_khr_subgroup_shuffle cl_intel_subgroups cl_intel_subgroups_short \
cl_ext_float_atomics cl_intel_required_subgroup_size")
# Read-write images are still partially broken, so advertise them only when conformance is OFF.
set( HOST_DEVICE_FEATURES_30 "${HOST_DEVICE_FEATURES_30} __opencl_c_read_write_images")
endif()

# Extensions that are considered feature-complete (preferably CTS-tested).
Expand Down
1 change: 1 addition & 0 deletions lib/CL/devices/common.c
Expand Up @@ -1832,6 +1832,7 @@ pocl_setup_opencl_c_with_version (cl_device_id dev, int supports_30)
static const cl_name_version OPENCL_EXTENSIONS[]
= { { CL_MAKE_VERSION (1, 0, 0), "cl_intel_required_subgroup_size" },
{ CL_MAKE_VERSION (1, 0, 0), "cl_intel_subgroups" },
{ CL_MAKE_VERSION (1, 0, 0), "cl_intel_subgroups_short" },
{ CL_MAKE_VERSION (1, 0, 0), "cl_intel_unified_shared_memory" },
{ CL_MAKE_VERSION (1, 0, 0), "cl_khr_byte_addressable_store" },
{ CL_MAKE_VERSION (1, 0, 0), "cl_khr_global_int32_base_atomics" },
Expand Down
88 changes: 75 additions & 13 deletions lib/kernel/subgroups.cl
Expand Up @@ -60,13 +60,42 @@ sub_group_all (int predicate)
return sub_group_reduce_min ((unsigned)predicate);
}

#ifdef cl_intel_subgroups

/* intel_sub_group_shuffle_down (cl_intel_subgroups).
 *
 * Every work-item computes idx = sub-group local id + delta and receives:
 *   - `current` as held by the work-item with local id idx, when idx is
 *     below the maximum sub-group size;
 *   - `next` as held by the work-item with local id
 *     (idx - maximum sub-group size) otherwise.
 *
 * Both shuffles are issued unconditionally by every work-item; the one whose
 * value is not needed is given the placeholder lane 0 and its result is
 * dropped by the final select.
 *
 * NOTE(review): no range check is done for idx >= 2 * maximum sub-group
 * size; the extension presumably leaves that case undefined - confirm.
 */
uint _CL_OVERLOADABLE
intel_sub_group_shuffle_down (uint current, uint next, uint delta)
{
  uint sgsize = get_max_sub_group_size ();
  uint idx = get_sub_group_local_id () + delta;
  /* Non-zero when the source lane lies inside `current`. */
  int from_current = idx < sgsize;

  uint cur_idx = from_current ? idx : 0;
  uint other_cur = sub_group_shuffle (current, cur_idx);

  uint next_idx = from_current ? 0 : idx - sgsize;
  uint other_next = sub_group_shuffle (next, next_idx);

  return from_current ? other_cur : other_next;
}

/* Sub-group block read of a single uint: each work-item returns the element
 * of p located at its own sub-group local id. */
uint _CL_OVERLOADABLE
intel_sub_group_block_read (const global uint *p)
{
  const uint lane = get_sub_group_local_id ();
  return p[lane];
}

/* Sub-group block read of two uints per work-item.  Component k of the
 * result is p[local id + k * maximum sub-group size], for k = 0, 1. */
uint2 _CL_OVERLOADABLE
intel_sub_group_block_read2 (const global uint *p)
{
  const uint lane = get_sub_group_local_id ();
  const uint stride = get_max_sub_group_size ();
  uint2 result;
  result.x = p[lane];
  result.y = p[lane + stride];
  return result;
}

/* Sub-group block read of four uints per work-item.  Component k of the
 * result is p[local id + k * maximum sub-group size], for k = 0..3. */
uint4 _CL_OVERLOADABLE
intel_sub_group_block_read4 (const global uint *p)
{
  const uint lane = get_sub_group_local_id ();
  const uint stride = get_max_sub_group_size ();
  uint4 result;
  result.s0 = p[lane];
  result.s1 = p[lane + stride];
  result.s2 = p[lane + 2 * stride];
  result.s3 = p[lane + 3 * stride];
  return result;
}

uint8 _CL_OVERLOADABLE
intel_sub_group_block_read8 (const global uint *p)
{
Expand All @@ -78,7 +107,52 @@ intel_sub_group_block_read8 (const global uint *p)
p[sglid + 7 * sgsize]);
}

#ifdef cl_intel_subgroups
/* Sub-group block write of a single uint: each work-item stores `data` into
 * the element of p located at its own sub-group local id.  Only one element
 * per work-item is written, so the sub-group size is not needed here. */
void _CL_OVERLOADABLE
intel_sub_group_block_write (global uint *p, uint data)
{
  uint sglid = get_sub_group_local_id ();
  p[sglid] = data;
}

/* Sub-group block write of two uints per work-item.  Component k of `data`
 * is stored to p[local id + k * maximum sub-group size], for k = 0, 1. */
void _CL_OVERLOADABLE
intel_sub_group_block_write2 (global uint *p, uint2 data)
{
  const uint stride = get_max_sub_group_size ();
  uint slot = get_sub_group_local_id ();

  p[slot] = data.s0;
  slot += stride;
  p[slot] = data.s1;
}

/* Sub-group block write of four uints per work-item.  Component k of `data`
 * is stored to p[local id + k * maximum sub-group size], for k = 0..3. */
void _CL_OVERLOADABLE
intel_sub_group_block_write4 (global uint *p, uint4 data)
{
  const uint stride = get_max_sub_group_size ();
  /* Running element index; advances by one sub-group row after each store. */
  uint slot = get_sub_group_local_id ();

  p[slot] = data.x;
  slot += stride;
  p[slot] = data.y;
  slot += stride;
  p[slot] = data.z;
  slot += stride;
  p[slot] = data.w;
}

/* Sub-group block write of eight uints per work-item.  Component k of `data`
 * is stored to p[local id + k * maximum sub-group size], for k = 0..7. */
void _CL_OVERLOADABLE
intel_sub_group_block_write8 (global uint *p, uint8 data)
{
  const uint stride = get_max_sub_group_size ();
  /* Running element index; advances by one sub-group row after each store. */
  uint slot = get_sub_group_local_id ();

  p[slot] = data.s0;
  slot += stride;
  p[slot] = data.s1;
  slot += stride;
  p[slot] = data.s2;
  slot += stride;
  p[slot] = data.s3;
  slot += stride;
  p[slot] = data.s4;
  slot += stride;
  p[slot] = data.s5;
  slot += stride;
  p[slot] = data.s6;
  slot += stride;
  p[slot] = data.s7;
}

#endif

#ifdef cl_intel_subgroups_short
/* https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroups_short.html
*/
ushort8 _CL_OVERLOADABLE
Expand All @@ -91,16 +165,4 @@ intel_sub_group_block_read_us8 (const global ushort *p)
p[sglid + 5 * sgsize], p[sglid + 6 * sgsize],
p[sglid + 7 * sgsize]);
}

/* intel_sub_group_shuffle_down (cl_intel_subgroups).
 *
 * Every work-item computes idx = sub-group local id + delta and receives:
 *   - `current` as held by the work-item with local id idx, when idx is
 *     below the maximum sub-group size;
 *   - `next` as held by the work-item with local id
 *     (idx - maximum sub-group size) otherwise.
 *
 * Both shuffles are issued unconditionally by every work-item; the one whose
 * value is not needed is given the placeholder lane 0 and its result is
 * dropped by the final select.
 *
 * NOTE(review): no range check is done for idx >= 2 * maximum sub-group
 * size; the extension presumably leaves that case undefined - confirm.
 */
uint _CL_OVERLOADABLE
intel_sub_group_shuffle_down (uint current, uint next, uint delta)
{
  uint sgsize = get_max_sub_group_size ();
  uint idx = get_sub_group_local_id () + delta;
  /* Non-zero when the source lane lies inside `current`. */
  int from_current = idx < sgsize;

  uint cur_idx = from_current ? idx : 0;
  uint other_cur = sub_group_shuffle (current, cur_idx);

  uint next_idx = from_current ? 0 : idx - sgsize;
  uint other_next = sub_group_shuffle (next, next_idx);

  return from_current ? other_cur : other_next;
}
#endif

0 comments on commit 9d0aed7

Please sign in to comment.