Configurable GPU thread/block index types, minor fixes to integer code generation and GPU runtimes #1357

Merged (8 commits, Sep 8, 2023)
1 change: 1 addition & 0 deletions dace/codegen/compiled_sdfg.py
@@ -287,6 +287,7 @@ def get_workspace_sizes(self) -> Dict[dtypes.StorageType, int]:
result: Dict[dtypes.StorageType, int] = {}
for storage in self.external_memory_types:
func = self._lib.get_symbol(f'__dace_get_external_memory_size_{storage.name}')
func.restype = ctypes.c_size_t
result[storage] = func(self._libhandle, *self._lastargs[1])

return result
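A note on the one-line fix above: by default, ctypes assumes a C `int` return type, so a `size_t` workspace size above 2**31 - 1 would be truncated. A minimal sketch of the behavior (library and symbol names are hypothetical stand-ins for the compiled SDFG library and its exported getter):

```python
import ctypes

# Hypothetical library and symbol, for illustration only.
lib = ctypes.CDLL('libexample.so')
func = lib.__dace_get_external_memory_size_CPU_Heap

# Without this line, ctypes defaults the return type to a C int, so sizes
# above 2**31 - 1 would be truncated (or appear negative).
func.restype = ctypes.c_size_t
size = func()  # full 64-bit value
```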
31 changes: 31 additions & 0 deletions dace/codegen/cppunparse.py
@@ -78,6 +78,7 @@
import numpy as np
import os
import tokenize
import warnings

import sympy
import dace
@@ -733,6 +734,21 @@ def _Num(self, t):
if isinstance(t.n, complex):
dtype = dtypes.DTYPE_TO_TYPECLASS[complex]

# Handle large integer values
if isinstance(t.n, int):
bits = t.n.bit_length()
if bits == 32: # Integer, potentially unsigned
if t.n >= 0: # unsigned
repr_n += 'U'
else: # signed, 64-bit
repr_n += 'LL'
elif 32 < bits <= 63:
repr_n += 'LL'
elif bits == 64 and t.n >= 0:
repr_n += 'ULL'
elif bits >= 64:
warnings.warn(f'Value wider than 64 bits encountered in expression ({t.n}), emitting as-is')

if repr_n.endswith("j"):
self.write("%s(0, %s)" % (dtype, repr_n.replace("inf", INFSTR)[:-1]))
else:
@@ -831,8 +847,23 @@ def _Tuple(
self.write(")")

unop = {"Invert": "~", "Not": "!", "UAdd": "+", "USub": "-"}
unop_lambda = {'Invert': (lambda x: ~x), 'Not': (lambda x: not x), 'UAdd': (lambda x: +x), 'USub': (lambda x: -x)}

def _UnaryOp(self, t):
# Dispatch constants after applying the operation
if sys.version_info[:2] < (3, 8):
if isinstance(t.operand, ast.Num):
newval = self.unop_lambda[t.op.__class__.__name__](t.operand.n)
newnode = ast.Num(n=newval)
self.dispatch(newnode)
return
else:
if isinstance(t.operand, ast.Constant):
newval = self.unop_lambda[t.op.__class__.__name__](t.operand.value)
newnode = ast.Constant(value=newval)
self.dispatch(newnode)
return

self.write("(")
self.write(self.unop[t.op.__class__.__name__])
self.write(" ")
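To make the new suffix rules in `_Num` easier to check, here is a small standalone mirror of that logic (illustration only, not part of the patch): values whose `bit_length()` is exactly 32 no longer fit a signed 32-bit `int`, so non-negative ones get a `U` suffix and negative ones are promoted to `LL`; 33 to 63 bits gets `LL`; a non-negative 64-bit value gets `ULL`.

```python
def int_literal_suffix(n: int) -> str:
    """Standalone mirror of the suffix selection in _Num, for illustration."""
    bits = n.bit_length()
    if bits == 32:
        return 'U' if n >= 0 else 'LL'
    elif 32 < bits <= 63:
        return 'LL'
    elif bits == 64 and n >= 0:
        return 'ULL'
    return ''  # fits in a signed 32-bit int, or is wider than 64 bits

assert int_literal_suffix(2**31) == 'U'          # 2147483648 needs unsigned 32-bit
assert int_literal_suffix(-(2**31) - 1) == 'LL'  # needs signed 64-bit
assert int_literal_suffix(2**40) == 'LL'
assert int_literal_suffix(2**64 - 1) == 'ULL'
```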
15 changes: 11 additions & 4 deletions dace/codegen/targets/cuda.py
@@ -1939,6 +1939,13 @@ def generate_kernel_scope(self, sdfg: SDFG, dfg_scope: ScopeSubgraphView, state_
kernel_params: list, function_stream: CodeIOStream, kernel_stream: CodeIOStream):
node = dfg_scope.source_nodes()[0]

# Get the thread/block index type
ttype = Config.get('compiler', 'cuda', 'thread_id_type')
tidtype = getattr(dtypes, ttype, False)
if not isinstance(tidtype, dtypes.typeclass):
raise ValueError(f'Configured type "{ttype}" for ``thread_id_type`` does not match any DaCe data type. '
'See ``dace.dtypes`` for available types (for example ``int32``).')

# allocating shared memory for dynamic threadblock maps
if has_dtbmap:
kernel_stream.write(
@@ -1990,8 +1997,8 @@ def generate_kernel_scope(self, sdfg: SDFG, dfg_scope: ScopeSubgraphView, state_

expr = _topy(bidx[i]).replace('__DAPB%d' % i, block_expr)

kernel_stream.write('int %s = %s;' % (varname, expr), sdfg, state_id, node)
self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, 'int')
kernel_stream.write(f'{tidtype.ctype} {varname} = {expr};', sdfg, state_id, node)
self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, tidtype.ctype)

# Delinearize beyond the third dimension
if len(krange) > 3:
@@ -2010,8 +2017,8 @@ def generate_kernel_scope(self, sdfg: SDFG, dfg_scope: ScopeSubgraphView, state_
)

expr = _topy(bidx[i]).replace('__DAPB%d' % i, block_expr)
kernel_stream.write('int %s = %s;' % (varname, expr), sdfg, state_id, node)
self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, 'int')
kernel_stream.write(f'{tidtype.ctype} {varname} = {expr};', sdfg, state_id, node)
self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, tidtype.ctype)

# Dispatch internal code
assert CUDACodeGen._in_device_code is False
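From the Python side, the lookup added above can be reproduced as a short sketch (assuming the default configuration ships with `int32`): the configured name is resolved against `dace.dtypes`, and its `ctype` replaces the previously hard-coded `int` in the generated index declarations.

```python
from dace import dtypes
from dace.config import Config

# Sketch of the lookup performed in generate_kernel_scope (default: 'int32').
ttype = Config.get('compiler', 'cuda', 'thread_id_type')
tidtype = getattr(dtypes, ttype, False)
if not isinstance(tidtype, dtypes.typeclass):
    raise ValueError(f'Configured type "{ttype}" is not a DaCe data type')

# With int32 the kernel declares e.g. 'int i0 = ...;', with int64 it becomes
# 'long long i0 = ...;' (tidtype.ctype supplies the C type name).
print(tidtype.ctype)
```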
11 changes: 11 additions & 0 deletions dace/config_schema.yml
@@ -413,6 +413,17 @@ required:
a specified larger block size in the third dimension. Default value is
derived from hardware limits on common GPUs.

thread_id_type:
type: str
title: Thread/block index data type
default: int32
description: >
Defines the data type for a thread and block index in the generated code.
The type is based on the type-classes in ``dace.dtypes``. For example,
``uint64`` is equivalent to ``dace.uint64``. Change this setting when large
index types are needed to address memory offsets that are beyond the 32-bit
range, or to reduce memory usage.


#############################################
# General FPGA flags
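A usage sketch for the new option (not part of the diff); it assumes the standard `dace.Config.set` API and the usual `.dace.conf` layout that mirrors the schema hierarchy:

```python
import dace

# Use 64-bit signed thread/block indices, e.g. when a flattened GPU index can
# exceed the 32-bit range.
dace.Config.set('compiler', 'cuda', 'thread_id_type', value='int64')

# Assumed equivalent in ~/.dace.conf:
#   compiler:
#     cuda:
#       thread_id_type: int64
```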
2 changes: 1 addition & 1 deletion dace/libraries/blas/environments/cublas.py
@@ -25,7 +25,7 @@ class cuBLAS:
def handle_setup_code(node):
location = node.location
if not location or "gpu" not in node.location:
location = 0
location = -1 # -1 means current device
else:
try:
location = int(location["gpu"])
2 changes: 1 addition & 1 deletion dace/libraries/blas/environments/rocblas.py
@@ -25,7 +25,7 @@ class rocBLAS:
def handle_setup_code(node):
location = node.location
if not location or "gpu" not in node.location:
location = 0
location = -1 # -1 means current device
else:
try:
location = int(location["gpu"])
12 changes: 8 additions & 4 deletions dace/libraries/blas/include/dace_cublas.h
@@ -21,8 +21,10 @@ static void CheckCublasError(cublasStatus_t const& status) {
}

static cublasHandle_t CreateCublasHandle(int device) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
if (device >= 0) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
}
}
cublasHandle_t handle;
CheckCublasError(cublasCreate(&handle));
@@ -65,8 +67,10 @@ class _CublasConstants {
}

_CublasConstants(int device) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
if (device >= 0) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
}
}
// Allocate constant zero with the largest used size
cudaMalloc(&zero_, sizeof(cuDoubleComplex) * 1);
60 changes: 32 additions & 28 deletions dace/libraries/blas/include/dace_rocblas.h
@@ -24,8 +24,10 @@ static void CheckRocblasError(rocblas_status const& status) {
}

static rocblas_handle CreateRocblasHandle(int device) {
if (hipSetDevice(device) != hipSuccess) {
throw std::runtime_error("Failed to set HIP device.");
if (device >= 0) {
if (hipSetDevice(device) != hipSuccess) {
throw std::runtime_error("Failed to set HIP device.");
}
}
rocblas_handle handle;
CheckRocblasError(rocblas_create_handle(&handle));
@@ -68,53 +70,55 @@ class _RocblasConstants {
}

_RocblasConstants(int device) {
if (hipSetDevice(device) != hipSuccess) {
throw std::runtime_error("Failed to set HIP device.");
if (device >= 0) {
if (hipSetDevice(device) != hipSuccess) {
throw std::runtime_error("Failed to set HIP device.");
}
}
// Allocate constant zero with the largest used size
hipMalloc(&zero_, sizeof(hipDoubleComplex) * 1);
hipMemset(zero_, 0, sizeof(hipDoubleComplex) * 1);
(void)hipMalloc(&zero_, sizeof(hipDoubleComplex) * 1);
(void)hipMemset(zero_, 0, sizeof(hipDoubleComplex) * 1);

// Allocate constant one
hipMalloc(&half_pone_, sizeof(__half) * 1);
(void)hipMalloc(&half_pone_, sizeof(__half) * 1);
__half half_pone = __float2half(1.0f);
hipMemcpy(half_pone_, &half_pone, sizeof(__half) * 1,
(void)hipMemcpy(half_pone_, &half_pone, sizeof(__half) * 1,
hipMemcpyHostToDevice);
hipMalloc(&float_pone_, sizeof(float) * 1);
(void)hipMalloc(&float_pone_, sizeof(float) * 1);
float float_pone = 1.0f;
hipMemcpy(float_pone_, &float_pone, sizeof(float) * 1,
(void)hipMemcpy(float_pone_, &float_pone, sizeof(float) * 1,
hipMemcpyHostToDevice);
hipMalloc(&double_pone_, sizeof(double) * 1);
(void)hipMalloc(&double_pone_, sizeof(double) * 1);
double double_pone = 1.0;
hipMemcpy(double_pone_, &double_pone, sizeof(double) * 1,
(void)hipMemcpy(double_pone_, &double_pone, sizeof(double) * 1,
hipMemcpyHostToDevice);
hipMalloc(&complex64_pone_, sizeof(hipComplex) * 1);
(void)hipMalloc(&complex64_pone_, sizeof(hipComplex) * 1);
hipComplex complex64_pone = make_hipFloatComplex(1.0f, 0.0f);
hipMemcpy(complex64_pone_, &complex64_pone, sizeof(hipComplex) * 1,
(void)hipMemcpy(complex64_pone_, &complex64_pone, sizeof(hipComplex) * 1,
hipMemcpyHostToDevice);
hipMalloc(&complex128_pone_, sizeof(hipDoubleComplex) * 1);
(void)hipMalloc(&complex128_pone_, sizeof(hipDoubleComplex) * 1);
hipDoubleComplex complex128_pone = make_hipDoubleComplex(1.0, 0.0);
hipMemcpy(complex128_pone_, &complex128_pone, sizeof(hipDoubleComplex) * 1,
(void)hipMemcpy(complex128_pone_, &complex128_pone, sizeof(hipDoubleComplex) * 1,
hipMemcpyHostToDevice);

// Allocate custom factors and default to zero
hipMalloc(&custom_alpha_, sizeof(hipDoubleComplex) * 1);
hipMemset(custom_alpha_, 0, sizeof(hipDoubleComplex) * 1);
hipMalloc(&custom_beta_, sizeof(hipDoubleComplex) * 1);
hipMemset(custom_beta_, 0, sizeof(hipDoubleComplex) * 1);
(void)hipMalloc(&custom_alpha_, sizeof(hipDoubleComplex) * 1);
(void)hipMemset(custom_alpha_, 0, sizeof(hipDoubleComplex) * 1);
(void)hipMalloc(&custom_beta_, sizeof(hipDoubleComplex) * 1);
(void)hipMemset(custom_beta_, 0, sizeof(hipDoubleComplex) * 1);
}

_RocblasConstants(_RocblasConstants const&) = delete;

~_RocblasConstants() {
hipFree(zero_);
hipFree(half_pone_);
hipFree(float_pone_);
hipFree(double_pone_);
hipFree(complex64_pone_);
hipFree(complex128_pone_);
hipFree(custom_alpha_);
hipFree(custom_beta_);
(void)hipFree(zero_);
(void)hipFree(half_pone_);
(void)hipFree(float_pone_);
(void)hipFree(double_pone_);
(void)hipFree(complex64_pone_);
(void)hipFree(complex128_pone_);
(void)hipFree(custom_alpha_);
(void)hipFree(custom_beta_);
}

_RocblasConstants& operator=(_RocblasConstants const&) = delete;
16 changes: 8 additions & 8 deletions dace/libraries/blas/nodes/gemm.py
@@ -184,11 +184,11 @@ def expansion(node, state, sdfg):
code = ''
if dtype in (dace.complex64, dace.complex128):
code = f'''
{dtype.ctype} alpha = {alpha};
{dtype.ctype} beta = {beta};
{dtype.ctype} __alpha = {alpha};
{dtype.ctype} __beta = {beta};
'''
opt['alpha'] = '&alpha'
opt['beta'] = '&beta'
opt['alpha'] = '&__alpha'
opt['beta'] = '&__beta'

code += ("cblas_{func}(CblasColMajor, {ta}, {tb}, "
"{M}, {N}, {K}, {alpha}, {x}, {lda}, {y}, {ldb}, {beta}, "
@@ -287,12 +287,12 @@ def expansion(cls, node, state, sdfg):

# Set pointer mode to host
call_prefix += f'''{cls.set_pointer_mode}(__dace_{cls.backend}blas_handle, {cls.pointer_host});
{dtype.ctype} alpha = {alpha};
{dtype.ctype} beta = {beta};
{dtype.ctype} __alpha = {alpha};
{dtype.ctype} __beta = {beta};
'''
call_suffix += f'''{cls.set_pointer_mode}(__dace_{cls.backend}blas_handle, {cls.pointer_device});'''
alpha = f'({cdtype} *)&alpha'
beta = f'({cdtype} *)&beta'
alpha = f'({cdtype} *)&__alpha'
beta = f'({cdtype} *)&__beta'
else:
alpha = constants[node.alpha]
beta = constants[node.beta]
2 changes: 1 addition & 1 deletion dace/libraries/lapack/environments/cusolverdn.py
@@ -24,7 +24,7 @@ class cuSolverDn:
def handle_setup_code(node):
location = node.location
if not location or "gpu" not in node.location:
location = 0
location = -1 # -1 means current device
else:
try:
location = int(location["gpu"])
6 changes: 4 additions & 2 deletions dace/libraries/lapack/include/dace_cusolverdn.h
@@ -21,8 +21,10 @@ static void CheckCusolverDnError(cusolverStatus_t const& status) {
}

static cusolverDnHandle_t CreateCusolverDnHandle(int device) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
if (device >= 0) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
}
}
cusolverDnHandle_t handle;
CheckCusolverDnError(cusolverDnCreate(&handle));
2 changes: 1 addition & 1 deletion dace/libraries/linalg/environments/cutensor.py
@@ -24,7 +24,7 @@ class cuTensor:
def handle_setup_code(node):
location = node.location
if not location or "gpu" not in node.location:
location = 0
location = -1 # -1 means current device
else:
try:
location = int(location["gpu"])
6 changes: 4 additions & 2 deletions dace/libraries/linalg/include/dace_cutensor.h
@@ -20,8 +20,10 @@ static void CheckCuTensorError(cutensorStatus_t const& status) {
}

static cutensorHandle_t CreateCuTensorHandle(int device) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
if (device >= 0) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
}
}
cutensorHandle_t handle;
CheckCuTensorError(cutensorInit(&handle));
2 changes: 1 addition & 1 deletion dace/libraries/sparse/environments/cusparse.py
@@ -24,7 +24,7 @@ class cuSPARSE:
def handle_setup_code(node):
location = node.location
if not location or "gpu" not in node.location:
location = 0
location = -1 # -1 means current device
else:
try:
location = int(location["gpu"])
6 changes: 4 additions & 2 deletions dace/libraries/sparse/include/dace_cusparse.h
@@ -20,8 +20,10 @@ static void CheckCusparseError(cusparseStatus_t const& status) {
}

static cusparseHandle_t CreateCusparseHandle(int device) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
if (device >= 0) {
if (cudaSetDevice(device) != cudaSuccess) {
throw std::runtime_error("Failed to set CUDA device.");
}
}
cusparseHandle_t handle;
CheckCusparseError(cusparseCreate(&handle));