Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft: GPU aware RMA #3875

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
45 changes: 38 additions & 7 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ INSTALL_DATA_LOCAL_TARGETS =
doc1_src_txt =
doc3_src_txt =


# add (+=) target names to this variable to add them to the dependencies of the
# 'clean-local' target
CLEAN_LOCAL_TARGETS =
Expand Down Expand Up @@ -139,21 +140,25 @@ mpi_fc_sources =
mpi_fc_modules =
mpi_cxx_sources =
mpi_core_sources =
cuda_sources =
cuda_objects =

lib_LTLIBRARIES =
lib_LIBRARIES =

# include our subdir automake fragments
include maint/Makefile.mk
include src/Makefile.mk

#AM_CPPFLAGS += $(cuda_objects)
if BUILD_PROFILING_LIB
# dropping mpi_fc_sources and mpi_cxx_sources from libmpmpi since they
# don't contribute any PMPI symbols.
lib_LTLIBRARIES += lib/lib@PMPILIBNAME@.la
lib_lib@PMPILIBNAME@_la_SOURCES = $(mpi_sources) $(mpi_f77_sources) $(mpi_core_sources)
lib_lib@PMPILIBNAME@_la_LDFLAGS = $(external_ldflags) $(ABIVERSIONFLAGS)
lib_lib@PMPILIBNAME@_la_CPPFLAGS = $(AM_CPPFLAGS) -DF77_USE_PMPI
lib_lib@PMPILIBNAME@_la_LIBADD = $(external_libs) $(pmpi_convenience_libs)
lib_lib@PMPILIBNAME@_la_CPPFLAGS = $(AM_CPPFLAGS) -DF77_USE_PMPI -Isrc/mpid/ch4/src/
lib_lib@PMPILIBNAME@_la_LIBADD = $(external_libs) $(pmpi_convenience_libs) $(cuda_objects) ./src/mpid/ch4/src/ch4_cuda_kernel_ops.o ./src/mpid/ch4/src/ch4_gpu.o -lcudart
EXTRA_lib_lib@PMPILIBNAME@_la_DEPENDENCIES = $(pmpi_convenience_libs)

# lib@MPILIBNAME@.la might depend on lib@PMPILIBNAME@.la. We add them
Expand All @@ -162,19 +167,44 @@ EXTRA_lib_lib@PMPILIBNAME@_la_DEPENDENCIES = $(pmpi_convenience_libs)
lib_LTLIBRARIES += lib/lib@MPILIBNAME@.la
lib_lib@MPILIBNAME@_la_SOURCES = $(mpi_sources)
lib_lib@MPILIBNAME@_la_LDFLAGS = $(ABIVERSIONFLAGS)
lib_lib@MPILIBNAME@_la_CPPFLAGS = $(AM_CPPFLAGS) -DMPICH_MPI_FROM_PMPI
lib_lib@MPILIBNAME@_la_LIBADD = lib/lib@PMPILIBNAME@.la $(mpi_convenience_libs)
lib_lib@MPILIBNAME@_la_CPPFLAGS = $(AM_CPPFLAGS) -DMPICH_MPI_FROM_PMPI -Isrc/mpid/ch4/src/
lib_lib@MPILIBNAME@_la_LIBADD = lib/lib@PMPILIBNAME@.la $(mpi_convenience_libs) $(cuda_objects) ./src/mpid/ch4/src/ch4_cuda_kernel_ops.o ./src/mpid/ch4/src/ch4_gpu.o -lcudart

else !BUILD_PROFILING_LIB

lib_LTLIBRARIES += lib/lib@MPILIBNAME@.la
lib_lib@MPILIBNAME@_la_SOURCES = $(mpi_sources) $(mpi_core_sources)
lib_lib@MPILIBNAME@_la_LDFLAGS = $(external_ldflags) $(ABIVERSIONFLAGS)
lib_lib@MPILIBNAME@_la_CPPFLAGS = $(AM_CPPFLAGS)
lib_lib@MPILIBNAME@_la_LIBADD = $(external_libs) $(pmpi_convenience_libs) $(mpi_convenience_libs)
lib_lib@MPILIBNAME@_la_CPPFLAGS = $(AM_CPPFLAGS) -Isrc/mpid/ch4/src/
lib_lib@MPILIBNAME@_la_LIBADD = $(external_libs) $(pmpi_convenience_libs) $(mpi_convenience_libs) $(cuda_objects) ./src/mpid/ch4/src/ch4_cuda_kernel_ops.o ./src/mpid/ch4/src/ch4_gpu.o -lcudart
EXTRA_lib_lib@MPILIBNAME@_la_DEPENDENCIES = $(pmpi_convenience_libs) $(mpi_convenience_libs)

endif !BUILD_PROFILING_LIB

if USE_CUDA

#lib_LIBRARIES += lib/libmpichcudahelper.a
#lib_libmpichcudahelper_a_SOURCES = $(cuda_sources)
#lib_libmpichcudahelper_a_LDFLAGS = $(external_ldflags) $(ABIVERSIONFLAGS)
#lib_libmpichcudahelper_a_CPPFLAGS = $(AM_CPPFLAGS)
#lib_libmpichcudahelper_a_LIBADD = $(external_libs) $(pmpi_convenience_libs) $(mpi_convenience_libs)
#EXTRA_lib_libmpichcudahelper_a_DEPENDENCIES = $(pmpi_convenience_libs) $(mpi_convenience_libs)
#lib_libmpichcudahelper_a_AR = $(NVCC) -arch=sm_30 -lib -o
#
#.cu.o: src/mpid/ch4/src/ch4_cuda_helper.h src/mpid/ch4/src/ch4_cuda_kernel_ops.h
# $(NVCC) $(AM_CPPFLAGS) $(external_ldflags) -arch=sm_30 -Xcompiler "-fPIC -DPIC" -dc -o $@ $<
#.src/mpid/ch4/src/ch4_gpu.o:
# $(NVCC) -dlink $(cuda_objects) -o src/mpid/ch4/src/ch4_gpu.o

#$(cuda_objects): $(cuda_sources)
# $(NVCC) $(AM_CPPFLAGS) -arch=sm_30 -Xcompiler "-fPIC -DPIC" -dc -o $@ $<

#AM_CPPFLAGS += $(cuda_objects)
#mpi_convenience_libs += $(cuda_objects) -lcudart
#pmpi_convenience_libs += lib/libmpichcudahelper.a

endif USE_CUDA

if BUILD_F77_BINDING
lib_LTLIBRARIES += lib/lib@MPIFCLIBNAME@.la
lib_lib@MPIFCLIBNAME@_la_CPPFLAGS = $(AM_CPPFLAGS)
Expand Down Expand Up @@ -637,11 +667,12 @@ include $(top_srcdir)/doc/Makefile.mk
# -local targets
install-data-local: $(INSTALL_DATA_LOCAL_TARGETS)


# sometimes helpful when debugging macros to see the preprocessed output.
# Also using '-CC' because comments provide useful landmarks

SUFFIXES += .i

.c.i:
$(COMPILE) -CC -E -o $@ $<


37 changes: 37 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -5621,6 +5621,43 @@ if test "X$f08_works" = "Xyes"; then
AC_SUBST(F08_C_COUNT)
AC_SUBST(F08_C_OFFSET)
fi
########################################################################
#For CUDA NVCC
AC_ARG_WITH([cuda],
[AS_HELP_STRING([[--with-cuda[=DIR]]],
[use the CUDA installation in DIR]) or system], [],
[with_cuda=no])

# Detect CUDA
cudaincludedir=""
cudalibdir=""
cudalibs=""

AC_SUBST([cudaincludedir])
AC_SUBST([cudalibs])
AC_SUBST([cudalibdir])

AM_CONDITIONAL([USE_CUDA], [test "x${with_cuda}" != "xno"])
if test "${with_cuda}" != "no" ; then
if test "${with_cuda}" != "system" ; then
AS_IF([test -s "${with_cuda}/include/cuda.h"],
[:],[AC_MSG_ERROR([the CUDA installation in "${with_cuda}" appears broken])])
cudaincludedir="${with_cuda}/include"
cudalibs="-lcuda -lcudart"
if test -d ${with_cuda}/lib64 ; then
cudalibdir="${with_cuda}/lib64"
else
cudalibdir="${with_cuda}/lib"
fi
else
AC_LINK_IFELSE([AC_LANG_PROGRAM([#include "cuda.h"
],
[])],
[AC_MSG_ERROR([the CUDA installation seems to be added from system path])], [])
fi
AC_SUBST([NVCC],["$with_cuda/bin/nvcc"])
AC_DEFINE(USE_CUDA,1,[Define if cuda is available in either user specified path or in system path])
fi

########################################################################
# Some of the settings need to be applied at the end
Expand Down
22 changes: 15 additions & 7 deletions src/include/mpir_op_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
#ifndef MPIR_OP_UTIL_H_INCLUDED
#define MPIR_OP_UTIL_H_INCLUDED

#include "ch4_cuda_helper.h"
#include "ch4_cuda_kernel_ops.h"

/* The MPI Standard (MPI-2.1, sec 5.9.2) defines which predfined reduction
operators are valid by groups of types:
C integer
Expand Down Expand Up @@ -45,13 +48,18 @@ MPIR_OP_TYPE_GROUP(C_INTEGER)
* emits a warning and generates invalid arithmetic code. We could drop the
* restrict instead, but we are more likely to get an optimization from it than
* const. [goodell@ 2010-12-15] */
#define MPIR_OP_TYPE_REDUCE_CASE(mpi_type_,c_type_,op_macro_) \
case (mpi_type_): { \
c_type_ * restrict a = (c_type_ *)inoutvec; \
/*const*/ c_type_ * restrict b = (c_type_ *)invec; \
for (i=0; i<len; i++) \
a[i] = op_macro_(a[i],b[i]); \
break; \
#define MPIR_OP_TYPE_REDUCE_CASE(mpi_type_,c_type_,type_name_,op_macro_) \
case (mpi_type_): { \
c_type_ * restrict a = (c_type_ *)inoutvec; \
/*const*/ c_type_ * restrict b = (c_type_ *)invec; \
if (is_mem_type_device(inoutvec) && is_mem_type_device(invec)) \
{ \
call_##type_name_##_##op_macro_(a, b, len); \
} else { \
for (i=0; i<len; i++) \
a[i] = op_macro_(a[i],b[i]); \
} \
break; \
}
/* helps enforce consistent naming */
#define MPIR_OP_TYPE_GROUP(group) MPIR_OP_TYPE_GROUP_##group
Expand Down
2 changes: 1 addition & 1 deletion src/mpi/coll/op/opband.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ void MPIR_BAND(void *invec, void *inoutvec, int *Len, MPI_Datatype * type)

switch (*type) {
#undef MPIR_OP_TYPE_MACRO
#define MPIR_OP_TYPE_MACRO(mpi_type_, c_type_, type_name_) MPIR_OP_TYPE_REDUCE_CASE(mpi_type_, c_type_, MPIR_LBAND)
#define MPIR_OP_TYPE_MACRO(mpi_type_, c_type_, type_name_) MPIR_OP_TYPE_REDUCE_CASE(mpi_type_, c_type_, type_name_, MPIR_LBAND)
/* no semicolons by necessity */
MPIR_OP_TYPE_GROUP(C_INTEGER)
MPIR_OP_TYPE_GROUP(FORTRAN_INTEGER)
Expand Down
2 changes: 1 addition & 1 deletion src/mpi/coll/op/opbor.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ void MPIR_BOR(void *invec, void *inoutvec, int *Len, MPI_Datatype * type)

switch (*type) {
#undef MPIR_OP_TYPE_MACRO
#define MPIR_OP_TYPE_MACRO(mpi_type_, c_type_, type_name_) MPIR_OP_TYPE_REDUCE_CASE(mpi_type_, c_type_, MPIR_LBOR)
#define MPIR_OP_TYPE_MACRO(mpi_type_, c_type_, type_name_) MPIR_OP_TYPE_REDUCE_CASE(mpi_type_, c_type_, type_name_, MPIR_LBOR)
/* no semicolons by necessity */
MPIR_OP_TYPE_GROUP(C_INTEGER)
MPIR_OP_TYPE_GROUP(FORTRAN_INTEGER)
Expand Down
2 changes: 1 addition & 1 deletion src/mpi/coll/op/opbxor.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ void MPIR_BXOR(void *invec, void *inoutvec, int *Len, MPI_Datatype * type)

switch (*type) {
#undef MPIR_OP_TYPE_MACRO
#define MPIR_OP_TYPE_MACRO(mpi_type_, c_type_, type_name_) MPIR_OP_TYPE_REDUCE_CASE(mpi_type_, c_type_, MPIR_LBXOR)
#define MPIR_OP_TYPE_MACRO(mpi_type_, c_type_, type_name_) MPIR_OP_TYPE_REDUCE_CASE(mpi_type_, c_type_, type_name_, MPIR_LBXOR)
/* no semicolons by necessity */
MPIR_OP_TYPE_GROUP(C_INTEGER)
MPIR_OP_TYPE_GROUP(FORTRAN_INTEGER)
Expand Down
2 changes: 1 addition & 1 deletion src/mpi/coll/op/opland.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ void MPIR_LAND(void *invec, void *inoutvec, int *Len, MPI_Datatype * type)

switch (*type) {
#undef MPIR_OP_TYPE_MACRO
#define MPIR_OP_TYPE_MACRO(mpi_type_, c_type_, type_name_) MPIR_OP_TYPE_REDUCE_CASE(mpi_type_, c_type_, MPIR_LLAND)
#define MPIR_OP_TYPE_MACRO(mpi_type_, c_type_, type_name_) MPIR_OP_TYPE_REDUCE_CASE(mpi_type_, c_type_, type_name_, MPIR_LLAND)
/* no semicolons by necessity */
MPIR_OP_TYPE_GROUP(C_INTEGER)

Expand Down
2 changes: 1 addition & 1 deletion src/mpi/coll/op/oplor.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ void MPIR_LOR(void *invec, void *inoutvec, int *Len, MPI_Datatype * type)

switch (*type) {
#undef MPIR_OP_TYPE_MACRO
#define MPIR_OP_TYPE_MACRO(mpi_type_, c_type_, type_name_) MPIR_OP_TYPE_REDUCE_CASE(mpi_type_, c_type_, MPIR_LLOR)
#define MPIR_OP_TYPE_MACRO(mpi_type_, c_type_, type_name_) MPIR_OP_TYPE_REDUCE_CASE(mpi_type_, c_type_, type_name_, MPIR_LLOR)
/* no semicolons by necessity */
MPIR_OP_TYPE_GROUP(C_INTEGER)

Expand Down
2 changes: 1 addition & 1 deletion src/mpi/coll/op/oplxor.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ void MPIR_LXOR(void *invec, void *inoutvec, int *Len, MPI_Datatype * type)

switch (*type) {
#undef MPIR_OP_TYPE_MACRO
#define MPIR_OP_TYPE_MACRO(mpi_type_, c_type_, type_name_) MPIR_OP_TYPE_REDUCE_CASE(mpi_type_, c_type_, MPIR_LLXOR)
#define MPIR_OP_TYPE_MACRO(mpi_type_, c_type_, type_name_) MPIR_OP_TYPE_REDUCE_CASE(mpi_type_, c_type_, type_name_, MPIR_LLXOR)
/* no semicolons by necessity */
MPIR_OP_TYPE_GROUP(C_INTEGER)

Expand Down
2 changes: 1 addition & 1 deletion src/mpi/coll/op/opmax.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ void MPIR_MAXF(void *invec, void *inoutvec, int *Len, MPI_Datatype * type)

switch (*type) {
#undef MPIR_OP_TYPE_MACRO
#define MPIR_OP_TYPE_MACRO(mpi_type_, c_type_, type_name_) MPIR_OP_TYPE_REDUCE_CASE(mpi_type_, c_type_, MPL_MAX)
#define MPIR_OP_TYPE_MACRO(mpi_type_, c_type_, type_name_) MPIR_OP_TYPE_REDUCE_CASE(mpi_type_, c_type_, type_name_, MPL_MAX)
/* no semicolons by necessity */
MPIR_OP_TYPE_GROUP(C_INTEGER)
MPIR_OP_TYPE_GROUP(FORTRAN_INTEGER)
Expand Down
2 changes: 1 addition & 1 deletion src/mpi/coll/op/opmin.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ void MPIR_MINF(void *invec, void *inoutvec, int *Len, MPI_Datatype * type)

switch (*type) {
#undef MPIR_OP_TYPE_MACRO
#define MPIR_OP_TYPE_MACRO(mpi_type_, c_type_, type_name_) MPIR_OP_TYPE_REDUCE_CASE(mpi_type_, c_type_, MPL_MIN)
#define MPIR_OP_TYPE_MACRO(mpi_type_, c_type_, type_name_) MPIR_OP_TYPE_REDUCE_CASE(mpi_type_, c_type_, type_name_, MPL_MIN)
/* no semicolons by necessity */
MPIR_OP_TYPE_GROUP(C_INTEGER)
MPIR_OP_TYPE_GROUP(FORTRAN_INTEGER)
Expand Down
6 changes: 4 additions & 2 deletions src/mpi/coll/op/opprod.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ void MPIR_PROD(void *invec, void *inoutvec, int *Len, MPI_Datatype * type)

switch (*type) {
#undef MPIR_OP_TYPE_MACRO
#define MPIR_OP_TYPE_MACRO(mpi_type_, c_type_, type_name_) MPIR_OP_TYPE_REDUCE_CASE(mpi_type_, c_type_, MPIR_LPROD)
#define MPIR_OP_TYPE_MACRO(mpi_type_, c_type_, type_name_) MPIR_OP_TYPE_REDUCE_CASE(mpi_type_, c_type_, type_name_, MPIR_LPROD)
/* no semicolons by necessity */
MPIR_OP_TYPE_GROUP(C_INTEGER)
MPIR_OP_TYPE_GROUP(FORTRAN_INTEGER)
Expand All @@ -45,9 +45,11 @@ void MPIR_PROD(void *invec, void *inoutvec, int *Len, MPI_Datatype * type)
break; \
}
#undef MPIR_OP_C_COMPLEX_TYPE_MACRO
#define MPIR_OP_C_COMPLEX_TYPE_MACRO(mpi_type_,c_type_,type_name_) MPIR_OP_TYPE_REDUCE_CASE(mpi_type_,c_type_,MPIR_LPROD)
#define MPIR_OP_C_COMPLEX_TYPE_MACRO(mpi_type_,c_type_,type_name_) MPIR_OP_TYPE_REDUCE_CASE(mpi_type_,c_type_,type_name_,MPIR_LPROD)
#ifndef USE_CUDA
MPIR_OP_TYPE_GROUP(COMPLEX)
MPIR_OP_TYPE_GROUP(COMPLEX_EXTRA)
#endif
/* put things back where we found them */
#undef MPIR_OP_TYPE_MACRO
#undef MPIR_OP_C_COMPLEX_TYPE_MACRO
Expand Down
6 changes: 4 additions & 2 deletions src/mpi/coll/op/opsum.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ void MPIR_SUM(void *invec, void *inoutvec, int *Len, MPI_Datatype * type)

switch (*type) {
#undef MPIR_OP_TYPE_MACRO
#define MPIR_OP_TYPE_MACRO(mpi_type_, c_type_, type_name_) MPIR_OP_TYPE_REDUCE_CASE(mpi_type_, c_type_, MPIR_LSUM)
#define MPIR_OP_TYPE_MACRO(mpi_type_, c_type_, type_name_) MPIR_OP_TYPE_REDUCE_CASE(mpi_type_, c_type_, type_name_, MPIR_LSUM)
/* no semicolons by necessity */
MPIR_OP_TYPE_GROUP(C_INTEGER)
MPIR_OP_TYPE_GROUP(FORTRAN_INTEGER)
Expand All @@ -44,9 +44,11 @@ void MPIR_SUM(void *invec, void *inoutvec, int *Len, MPI_Datatype * type)
}
/* C complex types are just simple sums */
#undef MPIR_OP_C_COMPLEX_TYPE_MACRO
#define MPIR_OP_C_COMPLEX_TYPE_MACRO(mpi_type_,c_type_,type_name_) MPIR_OP_TYPE_REDUCE_CASE(mpi_type_,c_type_,MPIR_LSUM)
#define MPIR_OP_C_COMPLEX_TYPE_MACRO(mpi_type_,c_type_,type_name_) MPIR_OP_TYPE_REDUCE_CASE(mpi_type_,c_type_,type_name_,MPIR_LSUM)
#ifndef USE_CUDA
MPIR_OP_TYPE_GROUP(COMPLEX)
MPIR_OP_TYPE_GROUP(COMPLEX_EXTRA)
#endif
/* put things back where we found them */
#undef MPIR_OP_TYPE_MACRO
#undef MPIR_OP_C_COMPLEX_TYPE_MACRO
Expand Down
49 changes: 49 additions & 0 deletions src/mpid/ch4/netmod/ucx/ucx_am.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
#define UCX_AM_H_INCLUDED

#include "ucx_impl.h"
#include "ch4_cuda_helper.h"

//const int is_mem_type_device (const void *address);

MPL_STATIC_INLINE_PREFIX void MPIDI_UCX_am_isend_callback(void *request, ucs_status_t status)
{
Expand Down Expand Up @@ -75,6 +78,52 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_am_isend(int rank,
MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_NM_AM_ISEND);

MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr, dt_true_lb);

// Copy data from device to host
if (is_mem_type_device(data) == 1) {
void *temp_buffer;
cudaError_t status;
#ifdef PROFILE_CUDA
int iter = 1000;
MPL_wtime_init();
MPL_time_t malloc_start_time, malloc_end_time;
double time_diff;
MPL_wtime(&malloc_start_time);
for (int i = 0; i < iter; i++) {
#endif
temp_buffer = malloc(data_sz);
#ifdef PROFILE_CUDA
}
MPL_wtime(&malloc_end_time);
MPL_wtime_diff(&malloc_start_time, &malloc_end_time, &time_diff);
printf("Malloc: %f us \t | \t", (time_diff * 1e6) / iter);

cudaEvent_t start_event, stop_event;
float time;

cuda_func(cudaEventCreate(&start_event));
cuda_func(cudaEventCreate(&stop_event));

cuda_func(cudaEventRecord(start_event, 0));
for (int i = 0; i < iter; i++) {
#endif
status = cudaMemcpy(temp_buffer, data, data_sz, cudaMemcpyDeviceToHost);
if (status != cudaSuccess) {
printf("FAILURE... Status: %s\n", cudaGetErrorString(status));
fflush(stdout);
goto fn_exit;
}
#ifdef PROFILE_CUDA
}
cuda_func(cudaEventRecord(stop_event, 0));
cuda_func(cudaEventSynchronize(stop_event));

cuda_func(cudaEventElapsedTime(&time, start_event, stop_event));
printf("D-H Memcpy: %f us \t | \t", (time * 1e3) / iter);
#endif
data = temp_buffer;
}

if (handler_id == MPIDIG_SEND &&
am_hdr_sz + sizeof(MPIDI_UCX_am_header_t) + data_sz > MPIDI_UCX_MAX_AM_EAGER_SZ) {
MPIDIG_send_long_req_mst_t lreq_hdr;
Expand Down
24 changes: 22 additions & 2 deletions src/mpid/ch4/src/Makefile.mk
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@ noinst_HEADERS += src/mpid/ch4/src/ch4_comm.h \
src/mpid/ch4/src/ch4r_callbacks.h \
src/mpid/ch4/src/ch4r_rma_origin_callbacks.h \
src/mpid/ch4/src/ch4r_rma_target_callbacks.h \
src/mpid/ch4/src/ch4r_request.h
src/mpid/ch4/src/ch4r_request.h\
src/mpid/ch4/src/ch4_cuda_helper.h\
src/mpid/ch4/src/ch4_cuda_kernel_ops.h

mpi_core_sources += src/mpid/ch4/src/ch4_globals.c \
src/mpid/ch4/src/ch4_impl.c \
Expand All @@ -56,7 +58,25 @@ mpi_core_sources += src/mpid/ch4/src/ch4_globals.c \
src/mpid/ch4/src/ch4r_rma_target_callbacks.c \
src/mpid/ch4/src/ch4r_symheap.c \
src/mpid/ch4/src/ch4r_win.c \
src/mpid/ch4/src/mpid_ch4_net_array.c
src/mpid/ch4/src/mpid_ch4_net_array.c\
src/mpid/ch4/src/ch4_cuda_helper.c

if USE_CUDA

cuda_sources += src/mpid/ch4/src/ch4_cuda_helper.h\
src/mpid/ch4/src/ch4_cuda_helper.cu\
src/mpid/ch4/src/ch4_cuda_kernel_ops.h\
src/mpid/ch4/src/ch4_cuda_kernel_ops.cu

cuda_objects += src/mpid/ch4/src/ch4_cuda_kernel_ops.o\
src/mpid/ch4/src/ch4_gpu.o
else

mpi_core_sources += src/mpid/ch4/src/ch4_cuda_helper.h\
src/mpid/ch4/src/ch4_cuda_helper.cu\
src/mpid/ch4/src/ch4_cuda_kernel_ops.h\
src/mpid/ch4/src/ch4_cuda_kernel_ops.cu
endif

if BUILD_CH4_COLL_TUNING
mpi_core_sources += src/mpid/ch4/src/ch4_coll_globals.c
Expand Down