Enable correct supported activities for kineto on rocm #88207
Conversation
🔗 Helpful Links
🧪 See artifacts and rendered test results at hud.pytorch.org/pr/88207
Note: Links to docs will display an error until the docs builds have been completed.
❌ 2 Failures. As of commit 01cd18c, the following jobs have failed:
This comment was automatically generated by Dr. CI and updates every 15 minutes.
LGTM
Hmm, I think according to this logic, if the code is built with GPU support, either NOROCTRACER or NOCUPTI would be defined:
pytorch/cmake/Dependencies.cmake
Lines 1908 to 2000 in 6e6f929
if(USE_KINETO)
  if((NOT USE_CUDA) OR MSVC)
    set(LIBKINETO_NOCUPTI ON CACHE STRING "" FORCE)
  else()
    set(LIBKINETO_NOCUPTI OFF CACHE STRING "")
    message(STATUS "Using Kineto with CUPTI support")
  endif()

  if(NOT USE_ROCM)
    set(LIBKINETO_NOROCTRACER ON CACHE STRING "" FORCE)
  else()
    set(LIBKINETO_NOROCTRACER OFF CACHE STRING "")
    message(STATUS "Using Kineto with Roctracer support")
  endif()

  if(LIBKINETO_NOCUPTI AND LIBKINETO_NOROCTRACER)
    message(STATUS "Using CPU-only version of Kineto")
  endif()

  set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party" CACHE STRING "")
  set(KINETO_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/kineto/libkineto" CACHE STRING "")
  set(KINETO_BUILD_TESTS OFF CACHE BOOL "")
  set(KINETO_LIBRARY_TYPE "static" CACHE STRING "")

  message(STATUS "Configuring Kineto dependency:")
  message(STATUS "  KINETO_SOURCE_DIR = ${KINETO_SOURCE_DIR}")
  message(STATUS "  KINETO_BUILD_TESTS = ${KINETO_BUILD_TESTS}")
  message(STATUS "  KINETO_LIBRARY_TYPE = ${KINETO_LIBRARY_TYPE}")

  if(NOT LIBKINETO_NOCUPTI)
    set(CUDA_SOURCE_DIR "${CUDA_TOOLKIT_ROOT_DIR}" CACHE STRING "")
    message(STATUS "  CUDA_SOURCE_DIR = ${CUDA_SOURCE_DIR}")
    message(STATUS "  CUDA_INCLUDE_DIRS = ${CUDA_INCLUDE_DIRS}")
    if(NOT MSVC)
      if(USE_CUPTI_SO)
        set(CUPTI_LIB_NAME "libcupti.so")
      else()
        set(CUPTI_LIB_NAME "libcupti_static.a")
      endif()
    else()
      set(CUPTI_LIB_NAME "cupti.lib")
    endif()

    find_library(CUPTI_LIBRARY_PATH ${CUPTI_LIB_NAME} PATHS
        ${CUDA_SOURCE_DIR}
        ${CUDA_SOURCE_DIR}/extras/CUPTI/lib64
        ${CUDA_SOURCE_DIR}/lib64
        NO_DEFAULT_PATH)
    find_path(CUPTI_INCLUDE_DIR cupti.h PATHS
        ${CUDA_SOURCE_DIR}/extras/CUPTI/include
        ${CUDA_INCLUDE_DIRS}
        ${CUDA_SOURCE_DIR}
        ${CUDA_SOURCE_DIR}/include
        NO_DEFAULT_PATH)

    if(CUPTI_LIBRARY_PATH AND CUPTI_INCLUDE_DIR)
      message(STATUS "  CUPTI_INCLUDE_DIR = ${CUPTI_INCLUDE_DIR}")
      set(CUDA_cupti_LIBRARY ${CUPTI_LIBRARY_PATH})
      message(STATUS "  CUDA_cupti_LIBRARY = ${CUDA_cupti_LIBRARY}")
      message(STATUS "Found CUPTI")
      set(LIBKINETO_NOCUPTI OFF CACHE STRING "" FORCE)

      # I've only tested this sanity check on Linux; if someone
      # runs into this bug on another platform feel free to
      # generalize it accordingly
      if(NOT USE_CUPTI_SO AND UNIX)
        include(CheckCXXSourceRuns)
        # rt is handled by the CMAKE_REQUIRED_LIBRARIES set above
        if(NOT APPLE)
          set(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES} "dl")
        endif()
        set(CMAKE_REQUIRED_LINK_OPTIONS "-Wl,--whole-archive,${CUPTI_LIBRARY_PATH},--no-whole-archive")
        check_cxx_source_runs("#include <stdexcept>
          int main() {
            try {
              throw std::runtime_error(\"error\");
            } catch (...) {
              return 0;
            }
            return 1;
          }" EXCEPTIONS_WORK)
        set(CMAKE_REQUIRED_LINK_OPTIONS "")
        if(NOT EXCEPTIONS_WORK)
          message(FATAL_ERROR "Detected that statically linking against CUPTI causes exceptions to stop working. See https://github.com/pytorch/pytorch/issues/57744 for more details. Perhaps try: USE_CUPTI_SO=1 python setup.py develop --cmake")
        endif()
      endif()
    else()
      message(STATUS "Could not find CUPTI library, using CPU-only Kineto build")
      set(LIBKINETO_NOCUPTI ON CACHE STRING "" FORCE)
    endif()
Correct: we need to enable ActivityType::CUDA (which means GPU in this case) whenever direct kernel measurements are available, i.e. when CUPTI or Roctracer is present. So we need that activity enabled unless BOTH are missing.

An observation: it would probably be better to query the compiled libkineto at runtime for its actual capabilities than to guess its capabilities from the CMake variables we passed to its build. Also, the negatively-named define variables seem like a bad choice, as you immediately end up with double negatives. LIBKINETO_USE_CUPTI and !LIBKINETO_USE_CUPTI speak to me more clearly, and you then wouldn't need to apply De Morgan's law to make sense of, e.g., this patch. Which may have been done on purpose. :)
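For illustration, a minimal stand-in sketch of the guard shape being discussed (the enum and function below are hypothetical simplifications, not the actual PyTorch sources): the GPU activity is advertised unless both LIBKINETO_NOCUPTI and LIBKINETO_NOROCTRACER are defined.

#include <cstdio>
#include <set>

enum class ActivityType { CPU, CUDA }; // "CUDA" covers HIP kernels on ROCm here

std::set<ActivityType> supportedActivities() {
  std::set<ActivityType> activities{ActivityType::CPU};
#if !defined(LIBKINETO_NOCUPTI) || !defined(LIBKINETO_NOROCTRACER)
  // Direct kernel measurement is available (CUPTI or Roctracer was built in),
  // so enable the GPU activity instead of the event-based fallback.
  activities.insert(ActivityType::CUDA);
#endif
  return activities;
}

int main() {
  std::printf("GPU activity enabled: %zu\n",
              supportedActivities().count(ActivityType::CUDA));
  return 0;
}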
It looks very confusing, but if CI is green (assuming we are testing kineto in CI), looks good to me.
So the second line is the same as saying: overall, if either CUPTI or Roctracer is present, then we want to use the GPU profiling library. And no argument that it is confusing. I was just dealing with what was there to begin with.
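To make the De Morgan step concrete, here is a tiny standalone check (made-up booleans, nothing PyTorch-specific) that "enable unless BOTH are missing", !(NOCUPTI && NOROCTRACER), agrees with "enable if EITHER is present", !NOCUPTI || !NOROCTRACER, on all four combinations:

#include <cstdio>

int main() {
  for (int nocupti = 0; nocupti <= 1; ++nocupti) {
    for (int noroctracer = 0; noroctracer <= 1; ++noroctracer) {
      // "Unless both backends are compiled out" ...
      bool unless_both_missing = !(nocupti && noroctracer);
      // ... is the same condition as "if either backend is present".
      bool either_present = !nocupti || !noroctracer;
      std::printf("NOCUPTI=%d NOROCTRACER=%d -> enable GPU activity: %d (same as %d)\n",
                  nocupti, noroctracer, unless_both_missing, either_present);
    }
  }
  return 0;
}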
@pytorchbot merge

Merge started. Your change will be merged once all checks pass (ETA 0-4 hours). Learn more about merging in the wiki. Questions? Feedback? Please reach out to the PyTorch DevX Team.
Merge failed. Reason: This PR is too stale; the last push date was more than 3 days ago. Please rebase and try again. You can rebase by leaving the following comment on this PR: @pytorchbot rebase
@pytorchbot rebase
You don't have permissions to rebase this PR; only people with write permissions may rebase PRs.
@mwootton Can you please double-check this CI failure: https://github.com/pytorch/pytorch/actions/runs/3463699947/jobs/5784908770
@pytorchbot merge -f "ROCM tests are green"

Merge started. Your change will be merged immediately since you used the force (-f) flag, bypassing any CI checks (ETA: 1-5 minutes). Learn more about merging in the wiki. Questions? Feedback? Please reach out to the PyTorch DevX Team.

@pytorchbot revert -m "Broke test_kineto on trunk / win-vs2019-cuda11.6-py3 / test (default, 4, 5, windows.8xlarge.nvidia.gpu)" -c ignoredsignal

@pytorchbot successfully started a revert job. Check the current status here.

@mwootton your PR has been successfully reverted.
This reverts commit 35093fc. Reverted #88207 on behalf of https://github.com/kit1980 due to Broke test_kineto on trunk / win-vs2019-cuda11.6-py3 / test (default, 4, 5, windows.8xlarge.nvidia.gpu)
A compile time guard was preventing ActivityType::CUDA from being available on ROCm. This caused both the GPU_FALLBACK and CUDA modes to be active at the same time, so operators were being charged GPU time for both the hipEventRecord ranges and the actual kernel execution times. This caused incorrect (and often negative) CUDA times in, e.g., table().

Pull Request resolved: pytorch#88207
Approved by: https://github.com/malfet, https://github.com/jeffdaily
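As a toy illustration of the symptom (made-up numbers, not real profiler output): self time is computed by subtracting children's time from an op's total, so charging a child kernel's time twice can drive the result negative.

#include <cstdio>

int main() {
  const double parent_range_us = 100.0; // hipEventRecord range around the op
  const double child_kernel_us = 80.0;  // actual kernel execution time

  // Correct attribution: the kernel's time is subtracted from the parent once.
  double self_correct = parent_range_us - child_kernel_us;      // 20 us

  // Buggy attribution: with GPU_FALLBACK and CUDA both active, the kernel is
  // charged via the event range AND as a separate kernel event, so it ends
  // up subtracted twice.
  double self_buggy = parent_range_us - 2.0 * child_kernel_us;  // -60 us

  std::printf("self (correct) = %.0f us, self (double-charged) = %.0f us\n",
              self_correct, self_buggy);
  return 0;
}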
* Update to kineto with fixes
* Enable correct supported activities for kineto on rocm (pytorch#88207)

A compile time guard was preventing ActivityType::CUDA from being available on rocm. This caused both the GPU_FALLBACK and CUDA modes to be active at the same time. So operators were being charged gpu time for the hipEventRecord ranges and the actual kernel execution times. This caused incorrect (and often negative) cuda times, in e.g. table().

Pull Request resolved: pytorch#88207
Approved by: https://github.com/malfet, https://github.com/jeffdaily
Co-authored-by: Michael Wootton <michael.wootton@amd.com>
Continuation of #88207. A compile time guard was preventing ActivityType::CUDA from being available on rocm. This caused both the GPU_FALLBACK and CUDA modes to be active at the same time. So operators were being charged gpu time for the hipEventRecord ranges and the actual kernel execution times. This caused incorrect (and often negative) cuda times, in e.g. table(). Previously a cmake variable was not being propagated to a '-D', causing an issue on Windows, which uses CUDA but not CUPTI.

Pull Request resolved: #89785
Approved by: https://github.com/jeffdaily, https://github.com/malfet
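A small standalone demo (hypothetical program, not the PyTorch sources) of why a CMake variable that never becomes a '-D' matters: the preprocessor only sees compile definitions, so if LIBKINETO_NOCUPTI stays a CMake-side variable the code below silently takes the "CUPTI present" branch.

#include <cstdio>

int main() {
#if defined(LIBKINETO_NOCUPTI)
  std::printf("built without CUPTI: GPU tracing via Roctracer only\n");
#else
  // This is the branch a build takes when the define is dropped,
  // even if the CMake variable was set to ON.
  std::printf("assuming CUPTI is present\n");
#endif
  return 0;
}

Compiling with "g++ demo.cpp" versus "g++ -DLIBKINETO_NOCUPTI demo.cpp" shows the two branches.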
…ineto (#92889)

PR #88207 enabled ActivityType::CUDA for ROCm. TestProfiler.test_kineto needs an update in the test code to look for the correct pattern for gemm kernels for ROCm.

Pull Request resolved: #92889
Approved by: https://github.com/malfet
cc @jeffdaily @sunway513 @jithunnair-amd @pruthvistony @ROCmSupport