Don't use RTLD_GLOBAL to load _C. #31162

Closed · wants to merge 26 commits

Changes from 14 commits

Commits (26):
beb3b40
Don't use RTLD_GLOBAL to load _C.
ezyang Dec 12, 2019
a0484e6
Update on "Don't use RTLD_GLOBAL to load _C."
ezyang Dec 12, 2019
3231d91
Update on "Don't use RTLD_GLOBAL to load _C."
ezyang Dec 12, 2019
a08fa45
Update on "Don't use RTLD_GLOBAL to load _C."
ezyang Dec 12, 2019
0d0f466
Update on "Don't use RTLD_GLOBAL to load _C."
ezyang Dec 13, 2019
1060202
Update on "Don't use RTLD_GLOBAL to load _C."
ezyang Dec 13, 2019
22a565c
Update on "Don't use RTLD_GLOBAL to load _C."
ezyang Dec 13, 2019
4fc627b
Update on "Don't use RTLD_GLOBAL to load _C."
ezyang Dec 13, 2019
b796cb3
Update on "Don't use RTLD_GLOBAL to load _C."
ezyang Dec 13, 2019
417faae
Update on "Don't use RTLD_GLOBAL to load _C."
ezyang Dec 13, 2019
28c0ef5
Update on "Don't use RTLD_GLOBAL to load _C."
ezyang Dec 13, 2019
25f8d0c
Update on "Don't use RTLD_GLOBAL to load _C."
ezyang Dec 13, 2019
95ebee5
Update on "Don't use RTLD_GLOBAL to load _C."
ezyang Dec 14, 2019
53461ff
Update on "Don't use RTLD_GLOBAL to load _C."
ezyang Jan 2, 2020
edfc4ea
Update on "Don't use RTLD_GLOBAL to load _C."
ezyang Jan 6, 2020
a77da74
Update on "Don't use RTLD_GLOBAL to load _C."
ezyang Jan 6, 2020
4170672
Update on "Don't use RTLD_GLOBAL to load _C."
ezyang Jan 6, 2020
9812a35
Update on "Don't use RTLD_GLOBAL to load _C."
ezyang Jan 7, 2020
70dd543
Update on "Don't use RTLD_GLOBAL to load _C."
ezyang Jan 7, 2020
993c810
Update on "Don't use RTLD_GLOBAL to load _C."
ezyang Jan 7, 2020
20872e7
Update on "Don't use RTLD_GLOBAL to load _C."
ezyang Jan 7, 2020
2081c67
Update on "Don't use RTLD_GLOBAL to load _C."
ezyang Jan 7, 2020
8948282
Update on "Don't use RTLD_GLOBAL to load _C."
ezyang Jan 7, 2020
593b661
Update on "Don't use RTLD_GLOBAL to load _C."
ezyang Jan 8, 2020
d79fe49
Update on "Don't use RTLD_GLOBAL to load _C."
ezyang Jan 8, 2020
2771d72
Update on "Don't use RTLD_GLOBAL to load _C."
ezyang Jan 8, 2020
41 changes: 29 additions & 12 deletions .jenkins/pytorch/test.sh
@@ -76,22 +76,39 @@ fi
# ASAN test is not working
if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then
export ASAN_OPTIONS=detect_leaks=0:symbolize=1:strict_init_order=true
# We suppress the vptr violation, since we have separate copies of
# libprotobuf in both libtorch.so and libcaffe2.so, and it causes
# the following problem:
# test_cse (__main__.TestJit) ... torch/csrc/jit/export.cpp:622:38:
# runtime error: member call on address ... which does not point
# to an object of type 'google::protobuf::MessageLite'
# ...: note: object is of type 'onnx_torch::ModelProto'
#
# This problem should be solved when libtorch.so and libcaffe2.so are
# merged.
export UBSAN_OPTIONS=print_stacktrace=1:suppressions=$PWD/ubsan.supp
export UBSAN_OPTIONS=print_stacktrace=1
export PYTORCH_TEST_WITH_ASAN=1
export PYTORCH_TEST_WITH_UBSAN=1
# TODO: Figure out how to avoid hard-coding these paths
export ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-5.0/bin/llvm-symbolizer
export LD_PRELOAD=/usr/lib/llvm-5.0/lib/clang/5.0.0/lib/linux/libclang_rt.asan-x86_64.so
# NB: we preload libtorch.so to ensure that subsequent loads of C++
# extension modules consistently reference the type info in
# libtorch.so and its dependencies (most notably, libc++.so).
# If another copy of type info is generated, then
# UBSAN will fail claiming a vptr violation that looks like:
#
# member call on address XXXXXX which does not point to an object of
# type 'std::_Sp_counted_base<__gnu_cxx::_Lock_policy::_S_atomic>'
# XXXXXX note: object is of type
# 'std::_Sp_counted_ptr<torch::nn::LinearImpl*, (__gnu_cxx::_Lock_policy)2>'
#
# (NB: the textual types of the objects here are misleading, because
# they actually line up; it just so happens that there's two copies
# of the type info floating around in the address space, so they
# don't pointer compare equal. See also
# https://github.com/google/sanitizers/issues/1175
#
# This didn't use to be necessary, because historically we loaded
# _C.so (and transitively, libtorch.so) using RTLD_GLOBAL. We
# stopped doing that to promote better hygiene of C++ symbols,
# but that means all weak symbols are going to get duplicated--this
# especially applies to type info, which is almost always weak. This
# has implications for RTTI (which UBSAN is rightly flagging won't
# work), but in our codebase, we don't use RTTI (because it doesn't
# work in mobile). However, UBSAN relies on UBSAN to detect vptr
Review comment (Collaborator):
nit: UBSAN relies on UBSAN ?
# confusion, so at least in this environment, we need our ducks in
# order!
export LD_PRELOAD=/usr/lib/llvm-5.0/lib/clang/5.0.0/lib/linux/libclang_rt.asan-x86_64.so:$PWD/torch/lib/libtorch_python.so
# Increase stack size, because ASAN red zones use more stack
ulimit -s 81920

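When debugging this LD_PRELOAD setup, a minimal Linux-only Python check (not part of the PR; /proc/self/maps is Linux-specific) can confirm that libtorch_python.so is already mapped before torch is imported:

# Linux-only sanity check: verify that libtorch_python.so was mapped into the
# process (e.g. via LD_PRELOAD) before any torch import happens.
with open('/proc/self/maps') as maps:
    preloaded = any('libtorch_python.so' in line for line in maps)
print('libtorch_python.so preloaded:', preloaded)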
23 changes: 23 additions & 0 deletions torch/CMakeLists.txt
@@ -390,3 +390,26 @@ if (NOT TORCH_PYTHON_LINK_FLAGS STREQUAL "")
endif()

install(TARGETS torch_python DESTINATION "${TORCH_INSTALL_LIB_DIR}")

# Note [Global dependencies]
# Some libraries (e.g. OpenMPI) like to dlopen plugins after they're initialized,
# and they assume that all of their symbols will be available in the global namespace.
# On the other hand we try to be good citizens and avoid polluting the symbol
# namespaces, so libtorch is loaded with all its dependencies in a local scope.
# That usually leads to missing symbol errors at run-time, so to avoid a situation like
# this we have to preload those libs in a global namespace.
add_library(torch_global_deps SHARED ${TORCH_SRC_DIR}/csrc/empty.c)
set_target_properties(torch_global_deps PROPERTIES LINKER_LANGUAGE C)
if (USE_MPI)
target_link_libraries(torch_global_deps ${MPI_CXX_LIBRARIES})
endif()
target_link_libraries(torch_global_deps ${MKL_LIBRARIES})
# The CUDA libraries are linked here for a different reason: in some
# cases we load these libraries with ctypes, and if they weren't opened
# with RTLD_GLOBAL, we'll do the "normal" search process again (and
# not find them, because they're usually in non-standard locations)
if (USE_CUDA)
target_link_libraries(torch_global_deps ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS})
endif()

install(TARGETS torch_global_deps DESTINATION "${TORCH_INSTALL_LIB_DIR}")
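
For illustration, a minimal ctypes sketch of the visibility difference described in Note [Global dependencies]; libplugin_host.so and its plugin are hypothetical names, and this is not code from the PR:

import ctypes

# Loaded with RTLD_LOCAL (the usual default), the host library's symbols are
# visible only through this handle, so a plugin it later dlopen()s cannot
# resolve them and fails with "undefined symbol" errors at run time.
host_local = ctypes.CDLL('libplugin_host.so', mode=ctypes.RTLD_LOCAL)

# Loaded with RTLD_GLOBAL, the host library's symbols are promoted into the
# global namespace, so any subsequently dlopen()ed plugin can resolve them.
# torch_global_deps plays this role for MPI, MKL and the CUDA libraries.
host_global = ctypes.CDLL('libplugin_host.so', mode=ctypes.RTLD_GLOBAL)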
62 changes: 23 additions & 39 deletions torch/__init__.py
@@ -13,6 +13,7 @@
import os
import sys
import platform
import ctypes
from ._utils import _import_dotted_name
from ._utils_internal import get_file_path, prepare_multiprocessing_environment
from .version import __version__
@@ -33,61 +34,44 @@
# Load the extension module
################################################################################

# Loading the extension with RTLD_GLOBAL option allows to not link extension
# modules against the _C shared object. Their missing THP symbols will be
# automatically filled by the dynamic loader.
import os as _dl_flags

# if we have numpy, it *must* be imported before the call to setdlopenflags()
# or there is risk that later c modules will segfault when importing numpy
try:
import numpy as _np
except ImportError:
pass

if platform.system() == 'Windows':
# first get nvToolsExt PATH
def get_nvToolsExt_path():
NVTOOLEXT_HOME = _dl_flags.getenv('NVTOOLSEXT_PATH', 'C:\\Program Files\\NVIDIA Corporation\\NvToolsExt')
NVTOOLSEXT_PATH = os.getenv('NVTOOLSEXT_PATH', 'C:\\Program Files\\NVIDIA Corporation\\NvToolsExt')

if _dl_flags.path.exists(NVTOOLEXT_HOME):
return _dl_flags.path.join(NVTOOLEXT_HOME, 'bin', 'x64')
else:
return ''
if os.path.exists(NVTOOLSEXT_PATH):
nvtoolsext_lib_path = os.path.join(NVTOOLSEXT_PATH, 'bin', 'x64')
else:
nvtoolsext_lib_path = ''
Review comment (Collaborator):
A tip that's maybe not directly related to this PR: we missed the CUDA path here. To make it compatible with Python 3.8, we have to add that.

Reply (Contributor Author):
I'm happy to fix this, but I'm not exactly sure what the suggestion is here. What's the other CUDA path we missed?

Reply (Collaborator):
Do we have a variable that records the version of CUDA used during the build? If yes, then the answer is quite simple. For example, for CUDA 9.2, just add the following path:

cuda_path = os.path.join(os.environ.get('CUDA_PATH_V9_2', r'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2'), 'bin')

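On the Python 3.8 point: since 3.8, Windows no longer consults PATH when resolving an extension module's dependent DLLs, so each directory would also need to be registered explicitly. A sketch under that assumption; _register_dll_directories is a hypothetical helper, and dll_paths would be the list this diff builds just below:

import os
import sys

def _register_dll_directories(dll_paths):
    # Hypothetical helper: on Python 3.8+ (Windows only), dependent DLLs are
    # no longer found via PATH and must be registered explicitly.
    if sys.platform == 'win32' and hasattr(os, 'add_dll_directory'):
        for path in dll_paths:
            if path and os.path.exists(path):
                os.add_dll_directory(path)
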
py_dll_path = _dl_flags.path.join(sys.exec_prefix, 'Library', 'bin')
th_dll_path = _dl_flags.path.join(_dl_flags.path.dirname(__file__), 'lib')
py_dll_path = os.path.join(sys.exec_prefix, 'Library', 'bin')
th_dll_path = os.path.join(os.path.dirname(__file__), 'lib')

dll_paths = [th_dll_path, py_dll_path, get_nvToolsExt_path(), _dl_flags.environ['PATH']]
dll_paths = [th_dll_path, py_dll_path, nvtoolsext_lib_path, os.environ['PATH']]

# then add the path to env
_dl_flags.environ['PATH'] = ';'.join(dll_paths)
os.environ['PATH'] = ';'.join(dll_paths)

else:
# first check if the os package has the required flags
if not hasattr(_dl_flags, 'RTLD_GLOBAL') or not hasattr(_dl_flags, 'RTLD_LAZY'):
try:
# next try if DLFCN exists
import DLFCN as _dl_flags
except ImportError:
# as a last attempt, use compile-time constants
import torch._dl as _dl_flags

old_flags = sys.getdlopenflags()
sys.setdlopenflags(_dl_flags.RTLD_GLOBAL | _dl_flags.RTLD_LAZY)
# See Note [Global dependencies]
def _load_global_deps():
if platform.system() == 'Windows':
Review comment (Collaborator):
What about defining a global variable IS_WINDOWS since it's used multiple times in this file?

Reply (Contributor Author):
I'll do this in a follow up
return

del _dl_flags
lib_name = 'libtorch_global_deps' + ('.dylib' if platform.system() == 'Darwin' else '.so')
here = os.path.abspath(__file__)
lib_path = os.path.join(os.path.dirname(here), 'lib', lib_name)

ctypes.CDLL(lib_path, mode=ctypes.RTLD_GLOBAL)
Review comment (Contributor):
I've been going through this out of curiosity and it got me wondering if this doesn't lead to an eventual dlclose? Don't we have to stash this library handle somewhere?

# See Note [Global dependencies]
_load_global_deps()

from torch._C import *

__all__ += [name for name in dir(_C)
if name[0] != '_' and
not name.endswith('Base')]

if platform.system() != 'Windows':
sys.setdlopenflags(old_flags)
del old_flags

################################################################################
# Define basic utilities
################################################################################
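Regarding the dlclose question in the review above: as far as I can tell, ctypes never dlclose()s a CDLL handle when the Python object is collected, so the PR's code should be safe. A more explicit variant (a sketch, not what the PR does) would stash the handle at module scope:

import ctypes
import os
import platform

# Module-level reference so the handle's lifetime is explicit; ctypes does
# not appear to dlclose() handles on collection, but stashing the object
# documents the intent.
_global_deps_lib = None

def _load_global_deps():
    global _global_deps_lib
    if platform.system() == 'Windows':
        return
    lib_name = 'libtorch_global_deps' + ('.dylib' if platform.system() == 'Darwin' else '.so')
    here = os.path.abspath(__file__)
    lib_path = os.path.join(os.path.dirname(here), 'lib', lib_name)
    _global_deps_lib = ctypes.CDLL(lib_path, mode=ctypes.RTLD_GLOBAL)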
Empty file added torch/csrc/empty.c
3 changes: 0 additions & 3 deletions ubsan.supp

This file was deleted.