diff --git a/src/libtorchaudio/cuctc/src/ctc_prefix_decoder_kernel_v2.cu b/src/libtorchaudio/cuctc/src/ctc_prefix_decoder_kernel_v2.cu index 97a2742691..9906f262f9 100644 --- a/src/libtorchaudio/cuctc/src/ctc_prefix_decoder_kernel_v2.cu +++ b/src/libtorchaudio/cuctc/src/ctc_prefix_decoder_kernel_v2.cu @@ -25,6 +25,7 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include "../include/ctc_prefix_decoder_host.h" #include "ctc_fast_divmod.cuh" #include "cub/cub.cuh" @@ -440,7 +441,11 @@ __launch_bounds__(BLOCK_SIZE) void topk_reduce_and_copy_list_per_batch_kernel( topk_values, beam, items_per_batch, +#if CUDART_VERSION >= 12090 // CUDA 12.9 and later + std::numeric_limits::lowest(), +#else cub::FpLimits::Lowest(), +#endif block_topk_fun, set_key_value); diff --git a/src/libtorchaudio/forced_align/gpu/compute.cu b/src/libtorchaudio/forced_align/gpu/compute.cu index ef7d9acaae..a78f694b51 100644 --- a/src/libtorchaudio/forced_align/gpu/compute.cu +++ b/src/libtorchaudio/forced_align/gpu/compute.cu @@ -94,7 +94,11 @@ __global__ void falign_cuda_step_kernel( alphas_a[curIdxOffset][i] = result + logProbs_a[batchIndex][t][labelIdx]; threadMax = max(threadMax, alphas_a[curIdxOffset][i]); } +#if CUDART_VERSION >= 12090 // CUDA 12.9 and later + scalar_t maxResult = BlockReduce(tempStorage).Reduce(threadMax, thrust::maximum()); +#else scalar_t maxResult = BlockReduce(tempStorage).Reduce(threadMax, cub::Max()); +#endif if (threadIdx.x == 0) { maxValue = maxResult; }