diff --git a/torch/csrc/jit/codegen/cuda/runtime/fused_reduction.cu b/torch/csrc/jit/codegen/cuda/runtime/fused_reduction.cu index 38a3ba3f5368..74e364ae7b4a 100644 --- a/torch/csrc/jit/codegen/cuda/runtime/fused_reduction.cu +++ b/torch/csrc/jit/codegen/cuda/runtime/fused_reduction.cu @@ -640,7 +640,7 @@ __device__ __inline__ void ParallelReduce< Z_THREAD, PERSISTENT_REDUCTION, BROADCAST>:: - reduce( + reduce( RefTuple out, const ConstRefTuple& inp, VolatilePtrTuple global_work_buffer, @@ -1056,7 +1056,7 @@ __device__ __inline__ void ParallelReduce< Z_THREAD, PERSISTENT_REDUCTION, BROADCAST>:: - reduce( + reduce( RefTuple out, const ConstRefTuple& inp, VolatilePtrTuple global_work_buffer, @@ -1113,7 +1113,7 @@ __device__ __inline__ void ParallelReduce< Z_THREAD, PERSISTENT_REDUCTION, BROADCAST>:: - reduceGroup( + reduceGroup( RefTuple out, const ConstRefTuple& inp, VolatilePtrTuple global_work_buffer, @@ -1298,7 +1298,7 @@ __device__ __inline__ void ParallelReduce< Z_THREAD, PERSISTENT_REDUCTION, BROADCAST>:: - reduceGroup( + reduceGroup( RefTuple out, const ConstRefTuple& inp, VolatilePtrTuple global_work_buffer, @@ -1358,7 +1358,7 @@ __device__ __inline__ LocalTuple ParallelReduce< Z_THREAD, PERSISTENT_REDUCTION, BROADCAST>:: - reduceGroupBlock( + reduceGroupBlock( const ConstRefTuple& inp, const LocalTuple& init_val, void* shared_mem, @@ -1434,7 +1434,7 @@ __device__ __inline__ void ParallelReduce< Z_THREAD, PERSISTENT_REDUCTION, BROADCAST>:: - reduceGroupLastBlock( + reduceGroupLastBlock( RefTuple& out, const VolatilePtrTuple& global_work_buffer, const LocalTuple& init_val, diff --git a/torch/csrc/jit/codegen/cuda/runtime/fused_welford_impl.cu b/torch/csrc/jit/codegen/cuda/runtime/fused_welford_impl.cu index 8dd9bab51621..8603087e8453 100644 --- a/torch/csrc/jit/codegen/cuda/runtime/fused_welford_impl.cu +++ b/torch/csrc/jit/codegen/cuda/runtime/fused_welford_impl.cu @@ -265,7 +265,7 @@ __device__ __inline__ void ParallelReduce< Z_THREAD, PERSISTENT_REDUCTION, BROADCAST>:: - welfordGroup( + welfordGroup( typename MakeRefTuple::type out_avg, typename MakeRefTuple::type out_var, typename MakeRefTuple::type out_N, @@ -465,7 +465,7 @@ __device__ __inline__ void ParallelReduce< Z_THREAD, PERSISTENT_REDUCTION, BROADCAST>:: - welfordGroupBlock( + welfordGroupBlock( LocalWelfordTripletTuple& block_result, const ConstRefWelfordTripletTuple& inp, PtrTuple shared_buf, @@ -534,7 +534,7 @@ __device__ __inline__ void ParallelReduce< Z_THREAD, PERSISTENT_REDUCTION, BROADCAST>:: - welfordGroupLastBlock( + welfordGroupLastBlock( RefWelfordTripletTuple& out, const VolatilePtrWelfordTripletTuple& global_work_buffer,