diff --git a/.circleci/cimodel/data/simple/ge_config_tests.py b/.circleci/cimodel/data/simple/ge_config_tests.py index 306f616d3ef7..235c08d62786 100644 --- a/.circleci/cimodel/data/simple/ge_config_tests.py +++ b/.circleci/cimodel/data/simple/ge_config_tests.py @@ -61,25 +61,25 @@ def gen_tree(self): MultiPartVersion([3, 6], "py"), MultiPartVersion([5, 4], "gcc"), None, - ["ge_config_legacy", "test"], + ["jit_legacy", "test"], ["pytorch_linux_xenial_py3_6_gcc5_4_build"]), GeConfigTestJob( MultiPartVersion([3, 6], "py"), MultiPartVersion([5, 4], "gcc"), None, - ["ge_config_simple", "test"], + ["jit_simple", "test"], ["pytorch_linux_xenial_py3_6_gcc5_4_build"], ), GeConfigTestJob( None, None, CudaVersion(10, 2), - ["cudnn7", "py3", "ge_config_legacy", "test"], + ["cudnn7", "py3", "jit_legacy", "test"], ["pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build"], use_cuda_docker=True, # TODO Why does the build environment specify cuda10.1, while the # job name is cuda10_2? - build_env_override="pytorch-linux-xenial-cuda10.1-cudnn7-ge_config_legacy-test"), + build_env_override="pytorch-linux-xenial-cuda10.1-cudnn7-jit_legacy-test"), ] diff --git a/.circleci/config.yml b/.circleci/config.yml index 06a7188b7c13..208e0d09eed0 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -7023,23 +7023,23 @@ workflows: requires: - docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c - pytorch_linux_test: - build_environment: pytorch-linux-xenial-py3.6-gcc5.4-ge_config_legacy-test + build_environment: pytorch-linux-xenial-py3.6-gcc5.4-jit_legacy-test docker_image: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4 - name: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_legacy_test + name: pytorch_linux_xenial_py3_6_gcc5_4_jit_legacy_test requires: - pytorch_linux_xenial_py3_6_gcc5_4_build resource_class: large - pytorch_linux_test: - build_environment: pytorch-linux-xenial-py3.6-gcc5.4-ge_config_simple-test + build_environment: pytorch-linux-xenial-py3.6-gcc5.4-jit_simple-test docker_image: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4 - name: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_simple_test + name: pytorch_linux_xenial_py3_6_gcc5_4_jit_simple_test requires: - pytorch_linux_xenial_py3_6_gcc5_4_build resource_class: large - pytorch_linux_test: - build_environment: pytorch-linux-xenial-cuda10.1-cudnn7-ge_config_legacy-test + build_environment: pytorch-linux-xenial-cuda10.1-cudnn7-jit_legacy-test docker_image: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7 - name: pytorch_linux_xenial_cuda10_2_cudnn7_py3_ge_config_legacy_test + name: pytorch_linux_xenial_cuda10_2_cudnn7_py3_jit_legacy_test requires: - pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build resource_class: gpu.medium diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh index 213750ba7280..8e71738f414e 100755 --- a/.jenkins/pytorch/macos-test.sh +++ b/.jenkins/pytorch/macos-test.sh @@ -63,7 +63,7 @@ test_python_all() { # Increase default limit on open file handles from 256 to 1024 ulimit -n 1024 - python test/run_test.py --verbose --exclude test_jit_cuda_fuser_profiling test_jit_cuda_fuser_legacy test_jit_legacy test_jit_fuser_legacy --determine-from="$DETERMINE_FROM" + python test/run_test.py --verbose --exclude-jit-executor --determine-from="$DETERMINE_FROM" assert_git_not_dirty } diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 48a0063bbda6..0e35364a2f5d 100755 
--- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -126,23 +126,18 @@ if ([ -n "$CIRCLE_PULL_REQUEST" ] && [[ "$BUILD_ENVIRONMENT" != *coverage* ]]); file_diff_from_base "$DETERMINE_FROM" fi -test_python_nn() { - time python test/run_test.py --include test_nn --verbose --determine-from="$DETERMINE_FROM" - assert_git_not_dirty -} - -test_python_ge_config_profiling() { - time python test/run_test.py --include test_jit_cuda_fuser_profiling test_jit_profiling test_jit_fuser_te test_tensorexpr --verbose --determine-from="$DETERMINE_FROM" +test_python_legacy_jit() { + time python test/run_test.py --include test_jit_cuda_fuser_legacy test_jit_legacy test_jit_fuser_legacy --verbose --determine-from="$DETERMINE_FROM" assert_git_not_dirty } -test_python_ge_config_legacy() { - time python test/run_test.py --include test_jit_cuda_fuser_legacy test_jit_legacy test_jit_fuser_legacy --verbose --determine-from="$DETERMINE_FROM" +test_python_shard1() { + time python test/run_test.py --exclude-jit-executor --shard 1 2 --verbose --determine-from="$DETERMINE_FROM" assert_git_not_dirty } -test_python_all_except_nn_and_cpp_extensions() { - time python test/run_test.py --exclude test_jit_cuda_fuser_profiling test_jit_cuda_fuser_legacy test_nn test_jit_profiling test_jit_legacy test_jit_fuser_legacy test_jit_fuser_te test_tensorexpr --verbose --determine-from="$DETERMINE_FROM" +test_python_shard2() { + time python test/run_test.py --exclude-jit-executor --shard 2 2 --verbose --determine-from="$DETERMINE_FROM" assert_git_not_dirty } @@ -304,7 +299,7 @@ test_xla() { assert_git_not_dirty } -# Do NOT run this test before any other tests, like test_python_nn, etc. +# Do NOT run this test before any other tests, like test_python_shard1, etc. # Because this function uninstalls the torch built from branch, and install # nightly version. 
test_backward_compatibility() { @@ -381,19 +376,17 @@ if [[ "${BUILD_ENVIRONMENT}" == *backward* ]]; then elif [[ "${BUILD_ENVIRONMENT}" == *xla* || "${JOB_BASE_NAME}" == *xla* ]]; then install_torchvision test_xla -elif [[ "${BUILD_ENVIRONMENT}" == *ge_config_legacy* || "${JOB_BASE_NAME}" == *ge_config_legacy* ]]; then - test_python_ge_config_legacy -elif [[ "${BUILD_ENVIRONMENT}" == *ge_config_profiling* || "${JOB_BASE_NAME}" == *ge_config_profiling* ]]; then - test_python_ge_config_profiling +elif [[ "${BUILD_ENVIRONMENT}" == *legacy_jit* || "${JOB_BASE_NAME}" == *legacy_jit* ]]; then + test_python_legacy_jit elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then # TODO: run some C++ tests echo "no-op at the moment" elif [[ "${BUILD_ENVIRONMENT}" == *-test1 || "${JOB_BASE_NAME}" == *-test1 ]]; then - test_python_nn - test_cpp_extensions + install_torchvision + test_python_shard1 elif [[ "${BUILD_ENVIRONMENT}" == *-test2 || "${JOB_BASE_NAME}" == *-test2 ]]; then install_torchvision - test_python_all_except_nn_and_cpp_extensions + test_python_shard2 test_aten test_libtorch test_custom_script_ops @@ -409,9 +402,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4 test_cpp_extensions else install_torchvision - test_python_nn - test_python_all_except_nn_and_cpp_extensions - test_cpp_extensions + test_python_shard1 + test_python_shard2 test_aten test_vec256 test_libtorch diff --git a/.jenkins/pytorch/win-test-helpers/test_python_all_except_nn.bat b/.jenkins/pytorch/win-test-helpers/test_python_all_except_nn.bat index 4bfb5bc85e66..d76637dd0db7 100644 --- a/.jenkins/pytorch/win-test-helpers/test_python_all_except_nn.bat +++ b/.jenkins/pytorch/win-test-helpers/test_python_all_except_nn.bat @@ -1,3 +1,3 @@ call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat -cd test && python run_test.py --exclude test_jit_cuda_fuser_profiling test_jit_cuda_fuser_legacy test_jit_profiling test_jit_legacy test_jit_fuser_legacy test_jit_fuser_te test_tensorexpr --verbose --determine-from="%1" && cd .. +cd test && python run_test.py --exclude-jit-executor --verbose --determine-from="%1" && cd .. if ERRORLEVEL 1 exit /b 1 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b01184603918..a1b4096592a7 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -118,11 +118,37 @@ For example: - modify your Python file `torch/__init__.py` - test functionality -You do not need to repeatedly install after modifying Python files. +You do not need to repeatedly install after modifying Python files (`.py`). However, you would need to reinstall +if you modify Python interface (`.pyi`, `.pyi.in`) or non-Python files (`.cpp`, `.cc`, `.cu`, `.h`, ...). In case you want to reinstall, make sure that you uninstall PyTorch first by running `pip uninstall torch` and `python setup.py clean`. Then you can install in `develop` mode again. +### Tips and Debugging +* A prerequisite to installing PyTorch is CMake. We recommend installing it with [Homebrew](https://brew.sh/) +with `brew install cmake` if you are developing on MacOS or Linux system. +* Our `setup.py` requires Python >= 3.6 +* If you run into errors when running `python setup.py develop`, here are some debugging steps: + 1. Run `printf '#include \nint main() { printf("Hello World");}'|clang -x c -; ./a.out` to make sure + your CMake works and can compile this simple Hello World program without errors. + 2. Nuke your `build` directory. 
The `setup.py` script compiles binaries into the `build` folder and caches many + details along the way, which saves time the next time you build. If you're running into issues, you can always + `rm -rf build` from the toplevel `pytorch` directory and start over. + 3. If you have made edits to the PyTorch repo, commit any change you'd like to keep and clean the repo with the + following commands (note that clean _really_ removes all untracked files and changes.): + ```bash + git submodule deinit -f . + git clean -xdf + python setup.py clean + git submodule update --init --recursive # very important to sync the submodules + python setup.py develop # then try running the command again + ``` + 4. The main step within `python setup.py develop` is running `make` from the `build` directory. If you want to + experiment with some environment variables, you can pass them into the command: + ```bash + ENV_KEY1=ENV_VAL1[, ENV_KEY2=ENV_VAL2]* python setup.py develop + ``` + ## Nightly Checkout & Pull The `tools/nightly.py` script is provided to ease pure Python development of diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 5ec9d24eea39..839964e33c59 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -51,6 +51,7 @@ file(GLOB cudnn_cpp "cudnn/*.cpp") file(GLOB hip_h "hip/*.h" "hip/detail/*.h" "hip/*.cuh" "hip/detail/*.cuh" "hip/impl/*.h") file(GLOB hip_cpp "hip/*.cpp" "hip/detail/*.cpp" "hip/impl/*.cpp") +list(REMOVE_ITEM hip_cpp "${CMAKE_CURRENT_SOURCE_DIR}/hip/detail/LazyNVRTC.cpp") file(GLOB hip_hip "hip/*.hip" "hip/detail/*.hip" "hip/impl/*.hip") file(GLOB hip_nvrtc_stub_h "hip/nvrtc_stub/*.h") file(GLOB hip_nvrtc_stub_cpp "hip/nvrtc_stub/*.cpp") diff --git a/aten/src/ATen/core/function_schema.h b/aten/src/ATen/core/function_schema.h index a9182787d2e6..a7b4e694d52e 100644 --- a/aten/src/ATen/core/function_schema.h +++ b/aten/src/ATen/core/function_schema.h @@ -156,18 +156,29 @@ struct FunctionSchema { checkSchema(); } - // check whether this schema is backward compatible with the old one. - // the following conditions are considered as this schema is backward - // compatible with old: - // 1) two schemas are equal - // 2) this schema has the same or more positional args than old, - // and any positional arg in this schema is backward compatible - // with the corresponding one in old schema, which could be an arg - // or a kwarg, if it has, or it must provide a default value - // 3) this schema has the same or more kwargs than old, and all the kwargs - // in old schema can find the corresponding kwarg in this schema which - // is backward compatible with the old kwarg, and the extra kwargs in - // this schema must provide default values. + // Checks whether this schema is backward compatible with the old one. + // The following conditions must be true: + // [Function structure] The new schema's name, overload-name, varargs, and + // return arity are the same. + // [Output Narrowing] The new schema's output type must be the same class + // or inherit from the old schema's output type. + // [Argument count] The new schema must have at least as many arguments as + // the old schema (considering the list of positional and kwargs). + // [Arg Compatibility] Every argument in the old schema has a corresponding + // argument in the new schema that: + // * is at the same position. + // * has the same name. + // * is either positional, or kwarg and the old argument was kwarg. 
+ // * has the same type, or the old argument's type inherits from the + // new argument's type. + // [Default Values] Every new argument must have a default value. + // E.g. + // OK f_new(a, b, c=1) => f_old(a, b) + // NOK f_new(a, c=1, *, b) => f_old(a, *, b) + // OK f_new(a, b, *, c) => f_old(a, *, b, c) + // NOK f_new(a, *, b, c) -> f_old(a, b, *, c) + // NOK f_new(a, *, c, b) => f_old(a, *, b, c) + // OK f_new(a, *, b, c, d=1) => f_old(a, *, b, c) bool isBackwardCompatibleWith( const FunctionSchema& old, std::ostream* why_not = nullptr) const; diff --git a/aten/src/ATen/core/function_schema_inl.h b/aten/src/ATen/core/function_schema_inl.h index bc9a68fbad3f..2185b35bc593 100644 --- a/aten/src/ATen/core/function_schema_inl.h +++ b/aten/src/ATen/core/function_schema_inl.h @@ -111,69 +111,35 @@ inline bool FunctionSchema::isBackwardCompatibleWith( return false; } for (size_t i = 0; i < returns().size(); ++i) { - // functions are covariant in arguments but contravariant in returns + // Backwards compatibility requires covariance on argument types + // (i.e. more generic), and contravariance on return types (i.e. + // more specific). if (!old.returns().at(i).isBackwardCompatibleWith( returns().at(i), why_not)) { return false; } } - std::vector args, old_args; - std::map kwargs, old_kwargs; - auto split_func = [](const std::vector& arguments, - std::vector* positionals, - std::map* nameds) { - for (const Argument& arg : arguments) { - if (!arg.kwarg_only()) { - positionals->emplace_back(&arg); - } - nameds->emplace(arg.name(), &arg); - } - }; - // we split args into positional and keyward parts, - split_func(arguments(), &args, &kwargs); - split_func(old.arguments(), &old_args, &old_kwargs); - if (old_args.size() > args.size()) { - return false; - } - // make sure that all the old positional args have their corresponding - // backward compatible positional args in this schema - for (size_t i = 0; i < old_args.size(); ++i) { - if (!args.at(i)->isBackwardCompatibleWith( - *old_args.at(i), - why_not)) { + + // Make sure that all the old arguments have their corresponding backward + // compatible arguments in this schema. + for (size_t i = 0; i < old.arguments().size(); ++i) { + if (!arguments().at(i).isBackwardCompatibleWith( + old.arguments().at(i), why_not)) { return false; } } - // check the extra positional args in this schema either has corresponding - // backward compatible keyward args since positional args also can be used as - // a keyward arg, or provided default values - for (size_t i = old_args.size(); i < args.size(); ++i) { - if (!args.at(i)->default_value()) { - auto it = old_kwargs.find(args.at(i)->name()); - if (it == old_kwargs.end() || - !args.at(i)->isBackwardCompatibleWith( - *it->second, - why_not)) { - return false; + + // Validate that all new arguments provided a default value. 
+ for (size_t i = old.arguments().size(); i < arguments().size(); ++i) { + if (!arguments().at(i).default_value()) { + if (why_not) { + *why_not + << "Function schema not backward compatible since the new argument '" + << arguments().at(i).name() << "' of type " + << arguments().at(i).type()->str() + << " did not provide a default value."; } - } - } - // make sure that all the keyword args in the old schema have their - // corresponding backward compatible keyward args in this schema - for (auto& kv : old_kwargs) { - auto it = kwargs.find(kv.first); - if (it == kwargs.end() || - !it->second->isBackwardCompatibleWith( - *kv.second, - why_not)) { - return false; - } - kwargs.erase(it); - } - // check all the extra keyword args in this schema provide default values - for (auto& kv : kwargs) { - if (!kv.second->default_value()) { return false; } } @@ -186,7 +152,6 @@ inline void FunctionSchema::checkArg( const Argument& argument, optional pos) const { if (!value.type()->isSubtypeOf(argument.type())) { - std::string position = pos ? ::c10::str(" in position ", *pos) : ""; TORCH_CHECK( false, formatTypeMismatchMsg( diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 58f6a8d53e92..28b9738034e7 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -28,6 +28,10 @@ #include #endif +#ifndef USE_ROCM +#include +#endif + #include #include @@ -116,10 +120,14 @@ bool CUDAHooks::hasCuDNN() const { return AT_CUDNN_ENABLED(); } -#ifdef USE_DIRECT_NVRTC +#if defined(USE_DIRECT_NVRTC) static std::pair, at::cuda::NVRTC*> load_nvrtc() { return std::make_pair(nullptr, at::cuda::load_nvrtc()); } +#elif !defined(USE_ROCM) +static std::pair, at::cuda::NVRTC*> load_nvrtc() { + return std::make_pair(nullptr, &at::cuda::detail::lazyNVRTC); +} #else static std::pair, at::cuda::NVRTC*> load_nvrtc() { #if defined(_WIN32) diff --git a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp new file mode 100644 index 000000000000..fae48c08b61f --- /dev/null +++ b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp @@ -0,0 +1,171 @@ +#include + +#include +#include +#include + +namespace at { +namespace cuda { +namespace detail { +namespace _stubs { + +at::DynamicLibrary& getCUDALibrary() { +#if defined(_WIN32) + static at::DynamicLibrary lib("nvcuda.dll"); +#else + static at::DynamicLibrary lib("libcuda.so.1"); +#endif + return lib; +} + +at::DynamicLibrary& getNVRTCLibrary() { + constexpr auto major = CUDA_VERSION / 1000; + constexpr auto minor = ( CUDA_VERSION / 10 ) % 10; +#if defined(_WIN32) + auto libname = std::string("nvrtc64_") + std::to_string(major) + std::to_string(minor) + "_0.dll"; +#else + static auto libname = std::string("libnvrtc.so.") + std::to_string(major) + "." 
+ std::to_string(minor); +#endif + static at::DynamicLibrary lib(libname.c_str()); + return lib; +} + +#define _STUB_1(LIB, NAME, RETTYPE, ARG1) \ +RETTYPE NAME(ARG1 a1) { \ + auto fn = reinterpret_cast(get## LIB ## Library().sym(__func__)); \ + if (!fn) \ + throw std::runtime_error("Can't get " C10_STRINGIZE(NAME) ); \ + lazyNVRTC.NAME = fn; \ + return fn(a1); \ +} + +#define _STUB_2(LIB, NAME, RETTYPE, ARG1, ARG2) \ +RETTYPE NAME(ARG1 a1, ARG2 a2) { \ + auto fn = reinterpret_cast(get## LIB ## Library().sym(__func__)); \ + if (!fn) \ + throw std::runtime_error("Can't get " C10_STRINGIZE(NAME) ); \ + lazyNVRTC.NAME = fn; \ + return fn(a1, a2); \ +} + +#define _STUB_3(LIB, NAME, RETTYPE, ARG1, ARG2, ARG3) \ +RETTYPE NAME(ARG1 a1, ARG2 a2, ARG3 a3) { \ + auto fn = reinterpret_cast(get## LIB ## Library().sym(__func__)); \ + if (!fn) \ + throw std::runtime_error("Can't get " C10_STRINGIZE(NAME) ); \ + lazyNVRTC.NAME = fn; \ + return fn(a1, a2, a3); \ +} + +#define _STUB_4(LIB, NAME, RETTYPE, ARG1, ARG2, ARG3, ARG4) \ +RETTYPE NAME(ARG1 a1, ARG2 a2, ARG3 a3, ARG4 a4) { \ + auto fn = reinterpret_cast(get## LIB ## Library().sym(__func__)); \ + if (!fn) \ + throw std::runtime_error("Can't get " C10_STRINGIZE(NAME) ); \ + lazyNVRTC.NAME = fn; \ + return fn(a1, a2, a3, a4); \ +} + +#define CUDA_STUB1(NAME, A1) _STUB_1(CUDA, NAME, CUresult CUDAAPI, A1) +#define CUDA_STUB2(NAME, A1, A2) _STUB_2(CUDA, NAME, CUresult CUDAAPI, A1, A2) +#define CUDA_STUB3(NAME, A1, A2, A3) _STUB_3(CUDA, NAME, CUresult CUDAAPI, A1, A2, A3) +#define CUDA_STUB4(NAME, A1, A2, A3, A4) _STUB_4(CUDA, NAME, CUresult CUDAAPI, A1, A2, A3, A4) + +#define NVRTC_STUB1(NAME, A1) _STUB_1(NVRTC, NAME, nvrtcResult, A1) +#define NVRTC_STUB2(NAME, A1, A2) _STUB_2(NVRTC, NAME, nvrtcResult, A1, A2) +#define NVRTC_STUB3(NAME, A1, A2, A3) _STUB_3(NVRTC, NAME, nvrtcResult, A1, A2, A3) + +NVRTC_STUB2(nvrtcVersion, int*, int*); +NVRTC_STUB2(nvrtcAddNameExpression, nvrtcProgram, const char * const); + +nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, + const char *src, + const char *name, + int numHeaders, + const char * const *headers, + const char * const *includeNames) { + auto fn = reinterpret_cast(getNVRTCLibrary().sym(__func__)); + if (!fn) + throw std::runtime_error("Can't get nvrtcCreateProgram"); + lazyNVRTC.nvrtcCreateProgram = fn; + return fn(prog, src, name, numHeaders, headers, includeNames); +} + +NVRTC_STUB1(nvrtcDestroyProgram, nvrtcProgram *); +NVRTC_STUB2(nvrtcGetPTXSize, nvrtcProgram, size_t *); +NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *); +NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char * const *); +_STUB_1(NVRTC, nvrtcGetErrorString, const char *, nvrtcResult); +NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*); +NVRTC_STUB2(nvrtcGetProgramLog, nvrtcProgram, char *); +NVRTC_STUB3(nvrtcGetLoweredName, nvrtcProgram, const char *, const char **); + +CUDA_STUB2(cuModuleLoadData, CUmodule *, const void *); +CUDA_STUB3(cuModuleGetFunction, CUfunction *, CUmodule, const char *); +CUDA_STUB4(cuOccupancyMaxActiveBlocksPerMultiprocessor, int *, CUfunction, int, size_t); +CUDA_STUB2(cuGetErrorString, CUresult, const char **); +CUDA_STUB1(cuCtxGetCurrent, CUcontext *); +CUDA_STUB1(cuModuleUnload, CUmodule); +CUDA_STUB3(cuDevicePrimaryCtxGetState, CUdevice, unsigned int *, int *); +CUDA_STUB4(cuLinkCreate, unsigned int, CUjit_option *, void **, CUlinkState *); +CUDA_STUB3(cuLinkComplete, CUlinkState, void **, size_t *); + +// Irregularly shaped functions +CUresult CUDAAPI cuLaunchKernel(CUfunction f, + unsigned 
int gridDimX, + unsigned int gridDimY, + unsigned int gridDimZ, + unsigned int blockDimX, + unsigned int blockDimY, + unsigned int blockDimZ, + unsigned int sharedMemBytes, + CUstream hStream, + void **kernelParams, + void **extra) { + auto fn = reinterpret_cast(getCUDALibrary().sym(__func__)); + if (!fn) + throw std::runtime_error("Can't get cuLaunchKernel"); + lazyNVRTC.cuLaunchKernel = fn; + return fn(f, + gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, + sharedMemBytes, hStream, kernelParams, extra); +} + +CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, + const void *image, + unsigned int numOptions, + CUjit_option *options, + void **optionValues) { + auto fn = reinterpret_cast(getCUDALibrary().sym(__func__)); + if (!fn) + throw std::runtime_error("Can't get cuModuleLoadDataEx"); + lazyNVRTC.cuModuleLoadDataEx = fn; + return fn(module, image, numOptions, options, optionValues); +} + +CUresult CUDAAPI +cuLinkAddData(CUlinkState state, + CUjitInputType type, + void *data, + size_t size, + const char *name, + unsigned int numOptions, + CUjit_option *options, + void **optionValues) { + auto fn = reinterpret_cast(getCUDALibrary().sym(__func__)); + if (!fn) + throw std::runtime_error("Can't get cuLinkAddData"); + lazyNVRTC.cuLinkAddData = fn; + return fn(state, type, data, size, name, numOptions, options, optionValues); +} + +} // namespace _stubs + +NVRTC lazyNVRTC = { +#define _REFERENCE_MEMBER(name) _stubs::name, + AT_FORALL_NVRTC(_REFERENCE_MEMBER) +#undef _REFERENCE_MEMBER +}; +} // namespace detail +} // namespace cuda +} // namespace at diff --git a/aten/src/ATen/cuda/detail/LazyNVRTC.h b/aten/src/ATen/cuda/detail/LazyNVRTC.h new file mode 100644 index 000000000000..810e1c322dbd --- /dev/null +++ b/aten/src/ATen/cuda/detail/LazyNVRTC.h @@ -0,0 +1,11 @@ +#pragma once +#include +namespace at { namespace cuda { +// Forward-declares at::cuda::NVRTC +struct NVRTC; + +namespace detail { +extern NVRTC lazyNVRTC; +} + +}} // at::cuda::detail diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index 17e24a38fdc7..e7e5659babbb 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -981,20 +981,12 @@ static void apply_svd(Tensor& self, Tensor& U, Tensor& S, Tensor& VT, auto m = self.size(-2); auto n = self.size(-1); auto mn = std::min(m, n); - Tensor iwork = at::empty({8*mn}, at::kInt); + Tensor iwork = at::empty({8 * mn}, at::kInt); auto iwork_data = iwork.data_ptr(); Tensor rwork; value_t* rwork_data = nullptr; if (isComplexType(at::typeMetaToScalarType(self.dtype()))) { - auto mx = std::max(m, n); - int64_t lrwork; // These settings are valid for on LAPACK 3.6+ - if (jobz == 'N'){ - lrwork = 7 * mn; - }else if (mx > 10 * mn){ - lrwork = 7 * mn * mn + 7 * mn; - } else { - lrwork = std::max(7 * mn * mn + 7 * mn, 2 * mx * mn + 2 *mn * mn + mn); - } + auto lrwork = computeLRWorkDim(jobz, m, n); // rwork is an array of floats or doubles depending on the type rwork = at::empty({std::max(int64_t(1), lrwork)}, at::typeMetaToScalarType(S.dtype())); rwork_data = rwork.data_ptr(); diff --git a/aten/src/ATen/native/LinearAlgebraUtils.h b/aten/src/ATen/native/LinearAlgebraUtils.h index 5c07700f1e85..4a6af18a5a96 100644 --- a/aten/src/ATen/native/LinearAlgebraUtils.h +++ b/aten/src/ATen/native/LinearAlgebraUtils.h @@ -318,4 +318,19 @@ static inline std::vector create_reverse_permutation(std::vector 10 * mn) { + return 5 * mn * mn + 5 * mn; + } + return std::max(5 * mn * mn + 5 * 
mn, 2 * mx * mn + 2 * mn * mn + mn); +} + }} // namespace at::native diff --git a/aten/src/ATen/native/ReplicationPadding.cpp b/aten/src/ATen/native/ReplicationPadding.cpp index 4a9c8cd7ad1a..a4eb075a5c3c 100644 --- a/aten/src/ATen/native/ReplicationPadding.cpp +++ b/aten/src/ATen/native/ReplicationPadding.cpp @@ -71,9 +71,11 @@ void replication_pad1d_out_cpu_template( int pad_l = paddingSize[0]; int pad_r = paddingSize[1]; - TORCH_CHECK(input_.numel() > 0 - && (input_.ndimension() == 2 || input_.ndimension() == 3), - "non-empty 2D or 3D (batch mode) tensor expected for input"); + // allow empty batch size but not other dimensions. + TORCH_CHECK((input_.dim() == 2 && input_.size(0) != 0 && input_.size(1) != 0) || + (input_.dim() == 3 && input_.size(1) != 0 && input_.size(2) != 0), + "Expected 2D or 3D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ", + input_.sizes()); if (input_.ndimension() == 3) { @@ -91,7 +93,6 @@ void replication_pad1d_out_cpu_template( "input (W: ", iwidth, ") is too small." " Calculated output W: ", owidth); - /* get contiguous input */ auto input = input_.contiguous(); @@ -216,6 +217,9 @@ Tensor& replication_pad1d_backward_out_cpu_template( /* get contiguous gradOutput */ auto gradOutput = gradOutput_.contiguous(); gradInput.resize_as_(input); + if (gradInput.numel() == 0) { + return gradInput; + } gradInput.zero_(); /* backprop */ @@ -339,8 +343,13 @@ void replication_pad2d_out_cpu_template(Tensor& output, int dimslices = 0; int64_t nbatch = 1; - TORCH_CHECK(input_.numel() > 0 && (input_.dim() == 3 || input_.dim() == 4), - "3D or 4D (batch mode) tensor expected for input, but got: ", input_); + // allow 0 dim batch size and nothing else. + bool valid_dims = input_.size(1) != 0 && input_.size(2) != 0; + TORCH_CHECK( + (input_.dim() == 3 && input_.size(0) != 0 && valid_dims) || + (input_.dim() == 4 && valid_dims && input_.size(3) != 0), + "Expected 3D or 4D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ", + input_.sizes()); if (input_.dim() == 4) { @@ -510,6 +519,10 @@ Tensor& replication_pad2d_backward_out_cpu_template( /* resize */ gradInput.resize_as_(input); + if (gradInput.numel() == 0) { + return gradInput; + } + gradInput.zero_(); /* backprop */ @@ -557,8 +570,13 @@ static inline void shapeCheck3d( int dimd = 1; int dimslices = 0; - TORCH_CHECK(input.numel() > 0 && (input.dim() == 4 || input.dim() == 5), - "non-empty 4D or 5D (batch mode) tensor expected for input, but got: ", input); + // allow batch size of 0-dim. 
+ bool valid_dims = input.size(1) != 0 && input.size(2) != 0 && input.size(3) != 0; + TORCH_CHECK( + (input.dim() == 4 && input.size(0) != 0 && valid_dims) || + (input.dim() == 5 && valid_dims && input.size(4) != 0), + "Expected 4D or 5D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ", + input.sizes()); if (input.dim() == 5) { @@ -872,6 +890,9 @@ Tensor& replication_pad3d_backward_out_cpu_template( /* resize */ gradInput.resize_as_(input); + if (gradInput.numel() == 0) { + return gradInput; + } gradInput.zero_(); /* backprop */ diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index cf8d52a7a3c2..21e4d63b163b 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -409,6 +409,70 @@ Tensor fft_irfftn(const Tensor& self, c10::optional s, return native::fft_irfft(x, last_shape, last_dim, norm); } +Tensor fft_fftfreq(int64_t n, double d, const TensorOptions& options) { + ScalarType dtype = typeMetaToScalarType(options.dtype()); + TORCH_CHECK(at::isFloatingType(dtype) || at::isComplexType(dtype), + "fftfreq requires a floating point or complex dtype"); + // TODO: arange doesn't have complex support + Tensor result = native::arange(n, options); + auto right_slice = result.slice(0, (n + 1) / 2, 0); + at::arange_out(right_slice, -(n/2), 0, 1); + result.mul_(1.0 / (n * d)); // Slightly faster than div_(n*d) + return result; +} + +Tensor fft_rfftfreq(int64_t n, double d, const TensorOptions& options) { + ScalarType dtype = typeMetaToScalarType(options.dtype()); + TORCH_CHECK(at::isFloatingType(dtype) || at::isComplexType(dtype), + "rfftfreq requires a floating point or complex dtype"); + // TODO: arange doesn't have complex support + Tensor result = native::arange(n/2 + 1, options); + result.mul_(1.0 / (n * d)); // Slightly faster than div_(n*d) + return result; +} + +// If an array dim is specified, wraps them according to self.dim(). +// Otherwise returns a vector of all dims. +DimVector default_alldims(const Tensor& self, c10::optional dim_opt) { + DimVector dim; + if (dim_opt) { + IntArrayRef dim_unwrapped = *dim_opt; + dim.resize(dim_unwrapped.size()); + for (int64_t i = 0; i < dim.size(); ++i) { + dim[i] = maybe_wrap_dim(dim_unwrapped[i], self.dim()); + } + } else { + dim.resize(self.dim()); + std::iota(dim.begin(), dim.end(), 0); + } + return dim; +} + +Tensor fft_fftshift(const Tensor& x, c10::optional dim_opt) { + auto dim = default_alldims(x, dim_opt); + + IntArrayRef x_sizes = x.sizes(); + DimVector shift(dim.size()); + for (int64_t i = 0; i < dim.size(); ++i) { + shift[i] = x_sizes[dim[i]] / 2; + } + + return at::roll(x, shift, dim); +} + +Tensor fft_ifftshift(const Tensor& x, c10::optional dim_opt) { + auto dim = default_alldims(x, dim_opt); + + IntArrayRef x_sizes = x.sizes(); + DimVector shift(dim.size()); + for (int64_t i = 0; i < dim.size(); ++i) { + shift[i] = (x_sizes[dim[i]] + 1) / 2; + } + + return at::roll(x, shift, dim); +} + + // This is a pass-through wrapper function that does the size check and // inferences. The actual forward implementation function is called // at::_fft_with_size which dispatches to _fft_cufft (CUDA) or _fft_mkl (CPU). 
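
Side note on the `fft_fftshift` / `fft_ifftshift` additions above: they are implemented as `at::roll` by `n/2` (respectively `(n+1)/2`) along each selected dim, so composing the two is the identity since `n/2 + (n+1)/2 == n`. A minimal sketch of that round-trip, not part of the PR; `fftshift_roundtrip_demo` is a hypothetical helper name used only for illustration.

```cpp
#include <ATen/ATen.h>

// Demonstrates the roll-based shift semantics used by fft_fftshift/fft_ifftshift above.
void fftshift_roundtrip_demo() {
  at::Tensor x = at::arange(5);
  const int64_t n = x.size(0);
  // fft_fftshift rolls each selected dim by n/2 ...
  at::Tensor shifted = at::roll(x, /*shifts=*/{n / 2}, /*dims=*/{0});
  // ... and fft_ifftshift rolls by (n+1)/2, undoing the shift for both even and odd n.
  at::Tensor restored = at::roll(shifted, {(n + 1) / 2}, {0});
  TORCH_CHECK(at::equal(restored, x));
}
```
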
diff --git a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp index 114ca93dae26..34911a2975e4 100644 --- a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp @@ -104,7 +104,11 @@ struct Dist { // Special general pnorm derivative if p is less than two struct lttdist_calc { - static inline Vec backward(const Vec& diff, const scalar_t grad, const scalar_t dist, const Vec& p) { return dist == 0.0 ? Vec(0) : sign(diff) * diff.abs().pow(p - Vec(1)) * Vec(grad) / Vec(dist).pow(p - Vec(1)); } + static inline Vec backward(const Vec& diff, const scalar_t grad, const scalar_t dist, const Vec& p) { + Vec result = (dist == 0.0) ? Vec(0) : (sign(diff) * diff.abs().pow(p - Vec(1)) * Vec(grad) / Vec(dist).pow(p - Vec(1))); + result = Vec::blendv(result, Vec(0), (diff == Vec(0)) & (p < Vec(1))); + return result; + } }; // Two norm diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu index 2a628bb54925..e9dfe2d9285d 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu @@ -1420,17 +1420,8 @@ AT_ERROR("svd: MAGMA library not found in " magma_int_t* iwork; ALLOCATE_ARRAY(iwork, magma_int_t, 8 * mn); - // Copy-n-paste rwork size computation from BatchLinearAlgebra.cpp if (isComplexType(at::typeMetaToScalarType(self.dtype()))) { - auto mx = std::max(m, n); - int64_t lrwork; // These settings are valid for on LAPACK 3.6+ - if (jobz == MagmaNoVec){ - lrwork = 7 * mn; - } else if (mx > 10 * mn){ - lrwork = 7 * mn * mn + 7 * mn; - } else { - lrwork = std::max(7 * mn * mn + 7 * mn, 2 * mx * mn + 2 *mn * mn + mn); - } + auto lrwork = computeLRWorkDim(jobchar, m, n); storage_rwork = pin_memory(lrwork); rwork = static_cast(storage_rwork.data()); } diff --git a/aten/src/ATen/native/cuda/DistanceKernel.cu b/aten/src/ATen/native/cuda/DistanceKernel.cu index 385cac5c79e8..c43a2ae9877e 100644 --- a/aten/src/ATen/native/cuda/DistanceKernel.cu +++ b/aten/src/ATen/native/cuda/DistanceKernel.cu @@ -50,7 +50,9 @@ struct dists { // Special case backward when p is less than two struct lt_two { - static __forceinline__ __device__ scalar_t backward(const scalar_t diff, const scalar_t grad, const scalar_t dist, const scalar_t p) { return dist == 0.0 ? 0 : sign(diff) * std::pow(std::abs(diff), p - 1) * grad / std::pow(dist, p - 1); } + static __forceinline__ __device__ scalar_t backward(const scalar_t diff, const scalar_t grad, const scalar_t dist, const scalar_t p) { + return (dist == 0.0 || (diff == 0.0 && p < 1)) ? 
0 : (sign(diff) * std::pow(std::abs(diff), p - 1) * grad / std::pow(dist, p - 1)); + } }; // Two norm diff --git a/aten/src/ATen/native/cuda/ReplicationPadding.cu b/aten/src/ATen/native/cuda/ReplicationPadding.cu index 515dc61eca64..b896a47afed9 100644 --- a/aten/src/ATen/native/cuda/ReplicationPadding.cu +++ b/aten/src/ATen/native/cuda/ReplicationPadding.cu @@ -217,14 +217,17 @@ void replication_pad1d_out_cuda_template( int numBatch = 1; int numInputDims = input.ndimension(); - TORCH_CHECK(input.numel() > 0 && (numInputDims == 2 || numInputDims == 3), - "2D or 3D (batch mode) tensor expected for input") - - if (numInputDims == 3) { - numBatch = input.size(0); - planeDim++; - dimw++; - } + TORCH_CHECK( + (numInputDims == 2 && input.size(0) != 0 && input.size(1) != 0) || + (numInputDims == 3 && input.size(1) != 0 && input.size(2) != 0), + "Expected 2D or 3D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ", + input.sizes()); + + if (numInputDims == 3) { + numBatch = input.size(0); + planeDim++; + dimw++; + } int numPlanes = input.size(planeDim); int inputW = input.size(dimw); @@ -234,13 +237,19 @@ void replication_pad1d_out_cuda_template( "input (W: ", inputW, ")is too small." " Calculated output W: ", outputW); + if (numInputDims == 2) { + output.resize_({numPlanes, outputW}); + } else { + output.resize_({numBatch, numPlanes, outputW}); + } + + if (input.numel() == 0) { + return; + } AT_DISPATCH_FLOATING_TYPES_AND_HALF( input.scalar_type(), "replication_pad1d_cuda", [&] { - - if (numInputDims == 2) { - output.resize_({numPlanes, outputW}); auto input_ = input.unsqueeze(0); auto output_ = output.unsqueeze(0); auto devInput = input_.packed_accessor64(); @@ -255,7 +264,6 @@ void replication_pad1d_out_cuda_template( replication_pad_forward_kernel1d <<>>(devInput, devOutput, padL, padR); } else { - output.resize_({numBatch, numPlanes, outputW}); auto devInput = input.packed_accessor64(); auto devOutput = output.packed_accessor64(); @@ -304,6 +312,9 @@ void replication_pad1d_backward_out_cuda_template( gradOutput.size(dimw)); gradInput.resize_as_(input); + if (gradInput.numel() == 0) { + return; + } gradInput.zero_(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( @@ -351,9 +362,12 @@ void replication_pad2d_out_cuda_template( int numBatch = 1; int numInputDims = input.dim(); - TORCH_CHECK(input.numel() && (numInputDims == 3 || numInputDims == 4), - "non-empty 3D or 4D (batch mode) tensor expected for input, but got: ", - input) + bool valid_dims = input.size(1) != 0 && input.size(2) != 0; + TORCH_CHECK( + (numInputDims == 3 && input.size(0) != 0 && valid_dims) || + (numInputDims == 4 && valid_dims && input.size(3) != 0), + "Expected 3D or 4D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ", + input.sizes()); if (numInputDims == 4) { numBatch = input.size(0); @@ -372,12 +386,21 @@ void replication_pad2d_out_cuda_template( "input (H: ", inputH, ", W: ", inputW, ") is too small." 
" Calculated output H: ", outputH, " W: ", outputW); + if (numInputDims == 3) { + output.resize_({numPlanes, outputH, outputW}); + } else { + output.resize_({numBatch, numPlanes, outputH, outputW}); + } + + if (input.numel() == 0) { + return; + } + AT_DISPATCH_FLOATING_TYPES_AND_HALF( input.scalar_type(), "replication_pad2d_cuda", [&] { if (numInputDims == 3) { - output.resize_({numPlanes, outputH, outputW}); auto input_ = input.unsqueeze(0); auto output_ = output.unsqueeze(0); auto devInput = input_.packed_accessor64(); @@ -393,7 +416,6 @@ void replication_pad2d_out_cuda_template( at::cuda::getCurrentCUDAStream()>>>( devInput, devOutput, padT, padB, padL, padR); } else { - output.resize_({numBatch, numPlanes, outputH, outputW}); auto devInput = input.packed_accessor64(); auto devOutput = output.packed_accessor64(); @@ -452,6 +474,9 @@ void replication_pad2d_backward_out_cuda_template( gradOutput.size(dimh)); gradInput.resize_as_(input); + if (gradInput.numel() == 0) { + return; + } gradInput.zero_(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( @@ -488,8 +513,12 @@ static inline void shapeCheck3d( "input tensor must fit into 32-bit index math"); int numInputDims = input.dim(); - TORCH_CHECK(input.numel() && (numInputDims == 4 || numInputDims == 5), - "non-empty 4D or 5D (batch mode) tensor expected for input, but got: ", input); + bool valid_dims = input.size(1) != 0 && input.size(2) != 0 && input.size(3) != 0; + TORCH_CHECK( + (numInputDims == 4 && input.size(0) != 0 && valid_dims) || + (numInputDims == 5 && valid_dims && input.size(4) != 0), + "Expected 4D or 5D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ", + input.sizes()); int planeDim = 0; int dimd = 1; @@ -526,8 +555,12 @@ static inline void shapeAndGradOutputCheck3d( "input tensor must fit into 32-bit index math"); int numInputDims = input.dim(); - TORCH_CHECK(input.numel() && (numInputDims == 4 || numInputDims == 5), - "non-empty 4D or 5D (batch mode) tensor expected for input, but got: ", input); + bool valid_dims = input.size(1) != 0 && input.size(2) != 0 && input.size(3) != 0; + TORCH_CHECK( + (numInputDims == 4 && valid_dims) || + (numInputDims == 5 && valid_dims && input.size(4) != 0), + "Expected 4D or 5D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ", + input.sizes()); int planeDim = 0; int dimd = 1; @@ -608,11 +641,20 @@ void replication_pad3d_out_cuda_template( int outputH = inputH + ptop + pbottom; int outputW = inputW + pleft + pright; + if (numInputDims == 4) { + output.resize_({numPlanes, outputD, outputH, outputW}); + } else { + output.resize_({numBatch, numPlanes, outputD, outputH, outputW}); + } + + if (input.numel() == 0) { + return; + } + AT_DISPATCH_FLOATING_TYPES_AND_HALF( input.scalar_type(), "replication_pad3d_cuda", [&] { if (numInputDims == 4) { - output.resize_({numPlanes, outputD, outputH, outputW}); auto input_ = input.unsqueeze(0); auto output_ = output.unsqueeze(0); auto devInput = input_.packed_accessor64(); @@ -629,7 +671,6 @@ void replication_pad3d_out_cuda_template( at::cuda::getCurrentCUDAStream()>>>( devInput, devOutput, pfront, pback, ptop, pbottom, pleft, pright); } else { - output.resize_({numBatch, numPlanes, outputD, outputH, outputW}); auto devInput = input.packed_accessor64(); auto devOutput = output.packed_accessor64(); @@ -679,6 +720,9 @@ void replication_pad3d_backward_out_cuda_template( } gradInput.resize_as_(input); + if (gradInput.numel() == 0) { + return; + } gradInput.zero_(); 
AT_DISPATCH_FLOATING_TYPES_AND_HALF( diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 64fbf7a147e3..c27cb4083ac2 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -8211,6 +8211,26 @@ use_c10_dispatcher: full variants: function +- func: fft_fftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_rfftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_fftshift(Tensor self, int[1]? dim=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_ifftshift(Tensor self, int[1]? dim=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + - func: fft(Tensor self, int signal_ndim, bool normalized=False) -> Tensor use_c10_dispatcher: full variants: function, method diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt index 43d0fc8ccd92..9f69c9d6ad6f 100644 --- a/aten/src/ATen/test/CMakeLists.txt +++ b/aten/src/ATen/test/CMakeLists.txt @@ -79,11 +79,13 @@ list(APPEND ATen_VULKAN_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/vulkan_test.cpp) list(APPEND ATen_MOBILE_TEST_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpu_profiling_allocator_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpu_caching_allocator_test.cpp) list(APPEND ATen_VEC256_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test_all_types.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test_all_types.cpp ) # Caffe2 specific tests diff --git a/aten/src/ATen/test/cpu_profiling_allocator_test.cpp b/aten/src/ATen/test/cpu_profiling_allocator_test.cpp new file mode 100644 index 000000000000..d3391425e14b --- /dev/null +++ b/aten/src/ATen/test/cpu_profiling_allocator_test.cpp @@ -0,0 +1,167 @@ +#include + +#include +#include + +at::Tensor run_with_control_flow( + at::Tensor input, + at::Tensor conv_weight, + at::Tensor linear_weight, + bool cond, + std::vector& pointers, + bool record = false, + bool validate = false) { + if (cond) { + input = input * 2; + } + void* input_ptr = input.data_ptr(); + auto conv_out = at::conv2d(input, conv_weight); + void* conv_out_ptr = input.data_ptr(); + auto conv_out_flat = conv_out.view({conv_out.size(0), -1}); + auto output = at::linear(conv_out_flat, linear_weight); + if (record) { + pointers.push_back(input_ptr); + pointers.push_back(conv_out_ptr); + } + if (validate) { + TORCH_CHECK(input_ptr == pointers[0]); + TORCH_CHECK(conv_out_ptr == pointers[1]); + } + return output; +} + +TEST(CPUAllocationPlanTest, with_control_flow) { + at::Tensor a = at::rand({23, 16, 16, 16}); + at::Tensor conv_weight = at::rand({16, 16, 3, 3}); + // output shape + // 23, 16, 14, 14 + // Flattened shape = 23, 3136 + at::Tensor linear_weight = at::rand({32, 3136}); + at::Tensor output; + std::vector pointers; + + auto valid_allocation_plan = [&]() { + c10::AllocationPlan plan; + { + c10::WithProfileAllocationsGuard profile_guard(&plan); + output = run_with_control_flow( + a, conv_weight, linear_weight, true, pointers); + } + }; + ASSERT_NO_THROW(valid_allocation_plan()); + + auto validate_allocation_plan = + [&](bool record_mode, bool validation_mode) -> 
bool { + c10::AllocationPlan plan; + { + c10::WithProfileAllocationsGuard profile_guard(&plan); + output = + run_with_control_flow(a, conv_weight, linear_weight, record_mode, pointers); + } + bool success{true}; + for (uint64_t i = 0; i < 10; ++i) { + bool validation_success; + { + c10::WithValidateAllocationPlanGuard + validation_guard(&plan, &validation_success); + output = run_with_control_flow( + a, conv_weight, linear_weight, validation_mode, pointers); + } + success = success && validation_success; + } + return success; + }; + ASSERT_FALSE(validate_allocation_plan(false, true)); + ASSERT_FALSE(validate_allocation_plan(true, false)); + ASSERT_TRUE(validate_allocation_plan(true, true)); + ASSERT_TRUE(validate_allocation_plan(false, false)); +} + +TEST(CPUAllocationPlanTest, with_profiling_alloc) { + at::Tensor a = at::rand({23, 16, 16, 16}); + at::Tensor conv_weight = at::rand({16, 16, 3, 3}); + // output shape + // 23, 16, 14, 14 + // Flattened shape = 23, 3136 + at::Tensor linear_weight = at::rand({32, 3136}); + at::Tensor output; + std::vector pointers; + + auto valid_allocation_plan = [&]() { + c10::AllocationPlan plan; + { + c10::WithProfileAllocationsGuard profile_guard(&plan); + output = run_with_control_flow( + a, conv_weight, linear_weight, false, pointers); + } + }; + ASSERT_NO_THROW(valid_allocation_plan()); + + auto validate_allocation_plan = + [&](bool record_mode, + bool validation_mode, + bool validate_pointers) { + pointers.clear(); + c10::AllocationPlan plan; + { + c10::WithProfileAllocationsGuard profile_guard(&plan); + output = run_with_control_flow( + a, + conv_weight, + linear_weight, + record_mode, + pointers, + false, + false); + } + c10::CPUProfilingAllocator profiling_allocator; + { + c10::WithProfilingAllocatorGuard + profiling_allocator_guard(&profiling_allocator, &plan); + output = run_with_control_flow( + a, + conv_weight, + linear_weight, + validation_mode, + pointers, + validate_pointers, + false); + } + for (uint64_t i = 0; i < 10; ++i) { + { + c10::WithProfilingAllocatorGuard + profiling_allocator_guard(&profiling_allocator, &plan); + output = run_with_control_flow( + a, + conv_weight, + linear_weight, + validation_mode, + pointers, + false, + validate_pointers); + } + } + }; + // When control flow conditions are same between profiling and evaluation + // profiling allocator should not throw. + ASSERT_NO_THROW(validate_allocation_plan(true, true, false)); + ASSERT_NO_THROW(validate_allocation_plan(false, false, false)); + // Furthermore profiling allocator should return the same pointers + // back for the intermediate tensors + ASSERT_NO_THROW(validate_allocation_plan(true, true, true)); + ASSERT_NO_THROW(validate_allocation_plan(false, false, true)); + + // When control flow conditions are different between profiling and evaluation + // profiling allocator should throw. + ASSERT_THROW(validate_allocation_plan(true, false, false), c10::Error); + ASSERT_THROW(validate_allocation_plan(false, true, false), c10::Error); +} + +int main(int argc, char* argv[]) { +// At the moment caching allocator is only exposed to mobile cpu allocator. 
+#ifdef C10_MOBILE + ::testing::InitGoogleTest(&argc, argv); + at::manual_seed(42); + return RUN_ALL_TESTS(); +#endif /* C10_Mobile */ +} diff --git a/c10/core/CPUAllocator.cpp b/c10/core/CPUAllocator.cpp index 5502aaf4b3d6..c76fefe21d27 100644 --- a/c10/core/CPUAllocator.cpp +++ b/c10/core/CPUAllocator.cpp @@ -1,6 +1,7 @@ #include #include #include +#include // TODO: rename flags to C10 C10_DEFINE_bool( @@ -156,13 +157,20 @@ class DefaultMobileCPUAllocator final : public at::Allocator { // TODO: enable with better TLS support on mobile // profiledCPUMemoryReporter().Delete(pointer); auto allocator_ptr = GetThreadLocalCachingAllocator(); + auto profiling_allocator_ptr = GetThreadLocalProfilingAllocator(); if (allocator_ptr != nullptr) { allocator_ptr->free(pointer); + } else if (profiling_allocator_ptr != nullptr) { + profiling_allocator_ptr->free(pointer); } else { c10::free_cpu(pointer); // This adds extra cost to freeing memory to the default case when // caching allocator is not enabled. CPUCachingAllocator::record_free(pointer); + auto allocation_planner = GetThreadLocalAllocationPlanner(); + if (allocation_planner != nullptr) { + allocation_planner->record_free(pointer); + } } } @@ -179,10 +187,17 @@ class DefaultMobileCPUAllocator final : public at::Allocator { auto alloc_size = PreGuardBytes + nbytes + PostGuardBytes; void* data; auto allocator_ptr = GetThreadLocalCachingAllocator(); + auto profiling_allocator_ptr = GetThreadLocalProfilingAllocator(); if (allocator_ptr != nullptr) { data = allocator_ptr->allocate(alloc_size); + } else if (profiling_allocator_ptr != nullptr) { + data = profiling_allocator_ptr->allocate(alloc_size); } else { data = c10::alloc_cpu(alloc_size); + auto allocation_planner = GetThreadLocalAllocationPlanner(); + if (allocation_planner != nullptr) { + allocation_planner->record_allocation(alloc_size, data); + } } // profiledCPUMemoryReporter().New(data, alloc_size); return { diff --git a/c10/core/impl/DeviceGuardImplInterface.h b/c10/core/impl/DeviceGuardImplInterface.h index 516aebba0747..f7f5b4f867a9 100644 --- a/c10/core/impl/DeviceGuardImplInterface.h +++ b/c10/core/impl/DeviceGuardImplInterface.h @@ -209,7 +209,15 @@ class C10_API DeviceGuardImplRegistrar { static ::c10::impl::DeviceGuardImplRegistrar C10_ANONYMOUS_VARIABLE(g_##DeviceType)(::c10::DeviceType::DevType, new DeviceGuardImpl()); inline const DeviceGuardImplInterface* getDeviceGuardImpl(DeviceType type) { - auto p = device_guard_impl_registry[static_cast(type)].load(); + // Two adjacent int16_t fields DeviceType and DeviceIndex has field access + // miscompiled on NVCC. To workaround this issue, we apply a mask to the + // DeviceType. First check if the DeviceType is 16-bit. + // FB employees can see + // https://fb.workplace.com/groups/llvm.gcc/permalink/4053565044692080/ + // for more details + static_assert(sizeof(DeviceType) == 2, "DeviceType is not 16-bit"); + auto p = device_guard_impl_registry[static_cast(type) & 0xFFFF].load(); + // This seems to be the first place where you make use of a device // when you pass devices to factory functions. Give a nicer error // message in this case. 
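
For context on the profiling-allocator plumbing above (the new hooks in `DefaultMobileCPUAllocator` and the guards exercised in `cpu_profiling_allocator_test.cpp`): usage is two-phase, record an `AllocationPlan` under `WithProfileAllocationsGuard`, then replay under `WithProfilingAllocatorGuard`. A minimal sketch under stated assumptions, this is the mobile CPU allocator path (`C10_MOBILE`), the header path is assumed from the new file's location, and `run_workload` is a hypothetical stand-in for the profiled model.

```cpp
#include <c10/mobile/CPUProfilingAllocator.h>  // assumed header path for the new allocator
#include <ATen/ATen.h>

// Hypothetical workload; on mobile this would be the model's forward pass.
at::Tensor run_workload(const at::Tensor& input);

void profile_then_replay(const at::Tensor& input) {
  c10::AllocationPlan plan;
  {
    // Phase 1: record every allocation/free made on this thread into the plan.
    c10::WithProfileAllocationsGuard profile_guard(&plan);
    run_workload(input);
  }

  c10::CPUProfilingAllocator profiling_allocator;
  for (int i = 0; i < 10; ++i) {
    // Phase 2: replay the recorded allocation sequence out of one pre-sized blob.
    // Diverging from the plan (e.g. different control flow) raises a c10::Error,
    // as the test above asserts.
    c10::WithProfilingAllocatorGuard guard(&profiling_allocator, &plan);
    run_workload(input);
  }
}
```
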
diff --git a/c10/macros/Export.h b/c10/macros/Export.h index 5888207c5f80..966dd22e08fa 100644 --- a/c10/macros/Export.h +++ b/c10/macros/Export.h @@ -113,8 +113,8 @@ #define TORCH_HIP_API C10_IMPORT #endif -// Enums only need to be exported on windows -#ifdef _WIN32 +// Enums only need to be exported on windows for non-CUDA files +#if defined(_WIN32) && defined(__CUDACC__) #define C10_API_ENUM C10_API #else #define C10_API_ENUM diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h index f8e703e08746..4bd46fe97dfe 100644 --- a/c10/macros/Macros.h +++ b/c10/macros/Macros.h @@ -193,11 +193,14 @@ namespace at { namespace cuda { using namespace c10::hip; }} #define C10_DEVICE __device__ #define C10_HOST __host__ // constants from (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications) -// The maximum number of threads per multiprocessor is 1024 for Turing architecture (7.5) -// but 2048 for previous architectures. You'll get warnings if you exceed these constants. +// The maximum number of threads per multiprocessor is 1024 for Turing architecture (7.5), +// 1536 for Geforce Ampere (8.6), +// and 2048 for all other architectures. You'll get warnings if you exceed these constants. // Hence, the following macros adjust the input values from the user to resolve potential warnings. #if __CUDA_ARCH__ == 750 constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1024; +#elif __CUDA_ARCH__ == 860 +constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1536; #else constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 2048; #endif diff --git a/c10/mobile/CPUProfilingAllocator.cpp b/c10/mobile/CPUProfilingAllocator.cpp new file mode 100644 index 000000000000..3559c8ce280f --- /dev/null +++ b/c10/mobile/CPUProfilingAllocator.cpp @@ -0,0 +1,410 @@ +#include + +#include + +namespace c10 { + +namespace { +thread_local AllocationPlanner* allocation_planner{nullptr}; +thread_local CPUProfilingAllocator* profiling_allocator{nullptr}; + +struct MemBlock { + uint64_t start_offset, end_offset; + MemBlock(uint64_t s, uint64_t e) : start_offset(s), end_offset(e) {} + bool operator<(const MemBlock& other) const { + return end_offset <= other.start_offset; + } +}; + +bool validate_allocation_plan( + const std::vector& allocation_sizes, + const std::vector& allocation_offsets) { + std::set allocations; + for (uint64_t i = 0; i < allocation_sizes.size(); ++i) { + // Skip allocations not managed by AllocationPlan + if (allocation_offsets[i] == std::numeric_limits::max()) { + continue; + } + auto start_offset = allocation_offsets[i]; + auto end_offset = allocation_offsets[i] + allocation_sizes[i]; + if (!allocations.emplace(start_offset, end_offset).second) { + return false; + } + } + return true; +} + +enum class EventType { + Allocate = 0, + Free, + Invalid +}; + +struct MemEvent { + uint64_t time; + uint64_t allocation_id; + uint64_t size; + EventType type{EventType::Invalid}; + MemEvent(uint64_t t, uint64_t id, uint64_t s, EventType e) : + time(t), allocation_id(id), size(s), type(e) {} +}; + +std::vector create_and_sort_mem_events( + const std::vector& allocation_sizes, + const std::vector& allocation_lifetimes) { + std::vector events; + for (uint64_t i = 0; i < allocation_sizes.size(); ++i) { + // If observed allocation are freed outside the scope of + // observation, then allocations are not managed by the + // AllocationPlan. 
+ if (allocation_lifetimes[i] == std::numeric_limits::max()) { + continue; + } + events.emplace_back(i, i, allocation_sizes[i], EventType::Allocate); + events.emplace_back(allocation_lifetimes[i], i, allocation_sizes[i], EventType::Free); + } + std::sort( + events.begin(), + events.end(), + [](const MemEvent& a, + const MemEvent& b) -> bool {return a.time < b.time;}); + return events; +} + +std::vector formulate_greedy_allocation_plan( + const std::vector& allocation_sizes, + const std::vector& allocation_lifetimes) { + // Step 1. Construct all allocation/free events. + // Sort these events by timestamp. + // Step 2. Iterate through all events. + // 2.1 If allocate event: + // Find all candidate in free_size_to_offset map + // Greedily pick the first one. + // Remove the entry from free_size_to_offset map. + // new_offset = offset + request_size + // new_size = size - request_size + // Add new entry to both maps + // 2.2 If free event. + // Check if the returned offset merges with another chunk. + // If so merge until no more merging is possible. + // If returned offset does not merge, then + // just return it as a chunk. + + // lower_bound on this map will get all candidates of + // the right size for allocation. + std::map free_size_to_offset; + // This provides fast lookup when we want to insert freed block + // back, especially when we want to merge blocks. + ska::flat_hash_map::iterator> free_start_offset_to_size_iter; + ska::flat_hash_map::iterator> free_end_offset_to_size_iter; + // Upon free end_ptr = offset + size + // If end_ptr exists merge freed allocation + // Also find coresponding offset in size_to_offet + // Remove that entry and update with new size and offset + // If end_ptr does not exist then just insert offset,size + // in map and correspondingly size, offset in the other map. + // Merging should always be done recursively until no more chunks + // that can be found. + // After last free we should have only one entry left in these maps. + ska::flat_hash_map allocated_offset_to_size; + + std::vector allocation_offsets( + allocation_sizes.size(), std::numeric_limits::max()); + auto mem_events = create_and_sort_mem_events(allocation_sizes, allocation_lifetimes); + uint64_t max_offset{0}; + for (const auto& mem_event : mem_events) { + uint64_t alloc_offset; + uint64_t new_offset, new_size; + if (mem_event.type == EventType::Allocate) { + auto it = free_size_to_offset.lower_bound(mem_event.size); + if (it == free_size_to_offset.end()) { + // If there is no contiguous block of the size requested + // allocate a new one. + alloc_offset = max_offset; + max_offset += mem_event.size; + allocated_offset_to_size.emplace(alloc_offset, mem_event.size); + } else { + // If we have found a block of the size we want + // 1. change the block by allocating out of it. + // 1.1 Erase the entire block + // 1.2 Erase the reverse map entries + // 2. If block still has space left insert the remainder back in map. + // Including reverse map entries. + // 3. Insert the allocated block in allocated_offset_to_size. 
+ alloc_offset = it->second; + new_offset = alloc_offset + mem_event.size; + new_size = it->first - mem_event.size; + free_size_to_offset.erase(it); + free_start_offset_to_size_iter.erase(alloc_offset); + free_end_offset_to_size_iter.erase(alloc_offset + it->first); + if (new_size > 0) { + auto ref_it = free_size_to_offset.emplace(new_offset, new_size).first; + free_start_offset_to_size_iter.emplace(new_offset, ref_it); + free_end_offset_to_size_iter.emplace(new_offset + new_size, ref_it); + } + allocated_offset_to_size.emplace(alloc_offset, mem_event.size); + } + allocation_offsets[mem_event.allocation_id] = alloc_offset; + } else { + // 1. Check if freed block is adjancent to an existing free block + // at its end boundary. This is done by checking + // free_end_offset_to_size_iter. + // If we find such a block, remove it and adjust size of + // the block being freed. + // 2. Similarly check if freed block is adjacent to an existing + // free block at start boundary. This is done by checking + // free_start_offset_to_size_iter. + // If we find such a block, remove it and adjust size of + // the block being freed. + // 3. Inser the freed block in map. + auto freed_offset = allocation_offsets[mem_event.allocation_id]; + auto freed_size = mem_event.size; + auto end_offset = freed_offset + freed_size; + // Merge when another free block exist at the end of this block + auto end_it = free_end_offset_to_size_iter.find(end_offset); + if (end_it != free_end_offset_to_size_iter.end()) { + auto size_to_end_offset_iter = end_it->second; + freed_size += size_to_end_offset_iter->first; + free_size_to_offset.erase(size_to_end_offset_iter); + free_end_offset_to_size_iter.erase(end_it); + } + // Merge when freed block exist at the end of another free block + auto start_it = free_start_offset_to_size_iter.find(freed_offset); + if (start_it != free_start_offset_to_size_iter.end()) { + auto size_to_start_offset_iter = start_it->second; + freed_size += size_to_start_offset_iter->first; + freed_offset -= size_to_start_offset_iter->first; + free_size_to_offset.erase(size_to_start_offset_iter); + free_start_offset_to_size_iter.erase(start_it); + } + allocated_offset_to_size.erase(freed_offset); + auto freed_block_it = + free_size_to_offset.emplace(freed_size, freed_offset).first; + free_start_offset_to_size_iter.emplace(freed_offset, freed_block_it); + free_end_offset_to_size_iter.emplace( + freed_offset + freed_size, freed_block_it); + } + } + TORCH_CHECK(validate_allocation_plan(allocation_sizes, allocation_offsets), + "Allocation plan invaild."); + return allocation_offsets; +} + +} // namespace + +void AllocationPlan::clear() { + allocation_sizes.clear(); + allocation_lifetimes.clear(); + allocation_offsets.clear(); +} + +void AllocationPlanner::record_allocation( + const uint64_t size, const void* ptr) { + if (validation_mode_) { + validation_success = validation_success && validate_allocation(size, ptr); + return; + } + allocation_plan_->allocation_sizes.push_back(size); + allocation_plan_->allocation_lifetimes.push_back( + std::numeric_limits::max()); + allocation_ptr_to_id_.emplace(ptr, allocation_id_); + allocation_id_++; +} + +void AllocationPlanner::record_free(const void* ptr) { + if (validation_mode_) { + validation_success = validation_success && validate_free(ptr); + return; + } + auto it = allocation_ptr_to_id_.find(ptr); + if (it == allocation_ptr_to_id_.end()) { + // Free being recorded was allocated outside of WithProfileAllocationGuard + return; + } + auto id = it->second; + TORCH_CHECK(id < 
allocation_plan_->allocation_lifetimes.size(), + "Allocation must have been recorded during record_allocation."); + allocation_plan_->allocation_lifetimes[id] = allocation_id_; +} + +bool AllocationPlanner::validate_allocation( + const uint64_t size, const void* ptr) { + if (allocation_id_ >= allocation_plan_->allocation_sizes.size() || + allocation_plan_->allocation_sizes[allocation_id_] != size) { + TORCH_WARN( + "Allocation request does not match plan:", + "Allocation id:", + allocation_id_, + ", Number of recorded allocations:", + allocation_plan_->allocation_sizes.size(), + ", Recorded size of the requested allocation:", + allocation_plan_->allocation_sizes[allocation_id_], + ", but got:", + size); + + return false; + } + allocation_ptr_to_id_.emplace(ptr, allocation_id_); + allocation_id_++; + return true; +} + +bool AllocationPlanner::validate_free(const void* ptr) { + auto it = allocation_ptr_to_id_.find(ptr); + if (it == allocation_ptr_to_id_.end()) { + // Allocation that was made outside the validation scope is being freed here + return true; + } + auto id = (*it).second; + TORCH_CHECK(id < allocation_plan_->allocation_lifetimes.size(), + "Allocation must have been recorded during validate_allocation."); + auto lifetime_id = allocation_plan_->allocation_lifetimes[id]; + return (lifetime_id == allocation_id_); +} + +void AllocationPlanner::formulate_plan() { + allocation_plan_->allocation_offsets = + formulate_greedy_allocation_plan( + allocation_plan_->allocation_sizes, allocation_plan_->allocation_lifetimes); + allocation_plan_->total_size = 0; + for (auto i = 0; i < allocation_plan_->allocation_sizes.size(); ++i) { + if (allocation_plan_->allocation_lifetimes[i] == + std::numeric_limits::max()) { + continue; + } + auto limit = allocation_plan_->allocation_offsets[i] + allocation_plan_->allocation_sizes[i]; + allocation_plan_->total_size = std::max(allocation_plan_->total_size, limit); + } +} + +void AllocationPlanner::clear() { + allocation_plan_->clear(); + allocation_ptr_to_id_.clear(); +} + +void CPUProfilingAllocator::set_plan(const AllocationPlan* plan) { + TORCH_CHECK(plan != nullptr, "Allocation plan is nullptr."); + plan_ = plan; + allocation_id_ = 0; + allocation_ptr_to_id_.clear(); + if (current_size_ < plan->total_size) { + // Free existing memory and reallocate for larger size. + c10::free_cpu(blob_); + blob_ = c10::alloc_cpu(plan->total_size); + current_size_ = plan->total_size; + } +} + +void CPUProfilingAllocator::unset_plan() { + allocation_id_ = 0; + allocation_ptr_to_id_.clear(); + plan_ = nullptr; +} + +void* CPUProfilingAllocator::allocate(const size_t bytes) { + TORCH_CHECK(bytes == plan_->allocation_sizes[allocation_id_], + "Got allocation request that does not match with the plan."); + if (plan_->allocation_lifetimes[allocation_id_] == + std::numeric_limits::max()) { + // This allocation is not managed by ProfilingAllocator. + allocation_id_++; + return c10::alloc_cpu(bytes); + } + void* ptr = + reinterpret_cast(blob_) + + plan_->allocation_offsets[allocation_id_]; + TORCH_CHECK(allocation_ptr_to_id_.emplace(ptr, allocation_id_).second); + allocation_id_++; + return ptr; +} + +void CPUProfilingAllocator::free(void* const ptr) { + auto it = allocation_ptr_to_id_.find(ptr); + if (it == allocation_ptr_to_id_.end()) { + // Either + // 1. Allocation that was made outside the validation scope is being freed here + // or + // 2. Allocation that is not managed by profiling allocator is being freed. + // Example of the second type + // Tensor out; + // for (....) 
{ + // { + // CPUProfilingAllocator + // out = ...some op (This also frees previous memory held by out) + // } + // out is used.. + // } + c10::free_cpu(ptr); + return; + } + auto id = it->second; + TORCH_CHECK(id < plan_->allocation_lifetimes.size(), + "Freeing allocation that is not accordingly to the plan."); + auto lifetime_id = plan_->allocation_lifetimes[id]; + TORCH_CHECK( + lifetime_id == allocation_id_, + "Lifetime of allocations do not match: allocation_id ", + id, + ", expected:", + lifetime_id, + ", got:", + allocation_id_); +} + +CPUProfilingAllocator::~CPUProfilingAllocator() { + c10::free_cpu(blob_); +} + +WithProfileAllocationsGuard::WithProfileAllocationsGuard( + AllocationPlan* plan) { + // Nesting of allocation profiling does not seem meanigful. + TORCH_CHECK(allocation_planner == nullptr, + "Nesting profiling allocations is not supported."); + planner_ = std::make_unique(plan); + planner_->clear(); + allocation_planner = planner_.get(); +} + +WithProfileAllocationsGuard::~WithProfileAllocationsGuard() { + planner_->formulate_plan(); + allocation_planner = nullptr; +} + +WithValidateAllocationPlanGuard::WithValidateAllocationPlanGuard( + AllocationPlan* plan, bool* success) { + // Nesting of allocation profiling does not seem meanigful. + TORCH_CHECK(allocation_planner == nullptr, + "Nesting profiling allocations is not supported."); + planner_ = std::make_unique(plan, true); + success_ = success; + allocation_planner = planner_.get(); +} + +WithValidateAllocationPlanGuard::~WithValidateAllocationPlanGuard() { + *success_ = planner_->validation_success; + allocation_planner = nullptr; +} + +AllocationPlanner* GetThreadLocalAllocationPlanner() { + return allocation_planner; +} + +WithProfilingAllocatorGuard::WithProfilingAllocatorGuard( + CPUProfilingAllocator* allocator, const AllocationPlan* plan) { + // Nesting of profiling allocator is not supported. + TORCH_CHECK(profiling_allocator == nullptr, + "Nesting profiling allocators is not supported."); + profiling_allocator = allocator; + profiling_allocator->set_plan(plan); +} + +WithProfilingAllocatorGuard::~WithProfilingAllocatorGuard() { + profiling_allocator->unset_plan(); + profiling_allocator = nullptr; +} + +CPUProfilingAllocator* GetThreadLocalProfilingAllocator() { + return profiling_allocator; +} + +} // namespace c10 diff --git a/c10/mobile/CPUProfilingAllocator.h b/c10/mobile/CPUProfilingAllocator.h new file mode 100644 index 000000000000..4a7e79fe2857 --- /dev/null +++ b/c10/mobile/CPUProfilingAllocator.h @@ -0,0 +1,149 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace c10 { + +/* + * Given a sequence of allocations in a thread, AllocationPlan records + * 1. size of each allocation + * 2. Lifetime of each allocation. + * 3. allocation offsets: Memory offset for each allocation in a single blob of memory + * 4. Total size of a blob of memory required to satisfy all the allocations. + */ +class C10_API AllocationPlan { + private: + // Records size of each allocation by their sequential allocation ids. + std::vector allocation_sizes; + // This maps one allocation id (X) to another allocation id (Y). + // Allocation X is alive until allocation Y. From allocation Y onwards + // allocation X is not referenced. + // Thus Y is the id of the first allocation after X is freed. 
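To make the lifetime encoding concrete, here is a minimal sketch that drives the planner directly through the guard API declared in this header. The stack buffers and the explicit record_allocation/record_free calls are only stand-ins; in the intended flow the underlying CPU allocator is assumed to report its allocations and frees to the thread-local planner:

    #include <c10/mobile/CPUProfilingAllocator.h>

    void profile_lifetimes() {
      c10::AllocationPlan plan;
      {
        c10::WithProfileAllocationsGuard guard(&plan);
        auto* planner = c10::GetThreadLocalAllocationPlanner();
        char a[128], b[256], c[128];           // stand-ins for real buffers
        planner->record_allocation(128, a);    // allocation id 0
        planner->record_allocation(256, b);    // allocation id 1
        planner->record_free(a);               // lifetime of id 0 becomes 2
        planner->record_allocation(128, c);    // allocation id 2; may reuse id 0's offset
        planner->record_free(b);               // lifetime of id 1 becomes 3
        planner->record_free(c);               // lifetime of id 2 becomes 3
      }  // ~WithProfileAllocationsGuard() calls formulate_plan()
      // plan now holds sizes {128, 256, 128}, lifetimes {2, 3, 3}, and the
      // computed offsets/total_size for a single backing blob.
    }

Once recorded, the plan is meant to be validated over a few more representative runs with WithValidateAllocationPlanGuard and, if validation succeeds, handed to a CPUProfilingAllocator through WithProfilingAllocatorGuard, as the usage comments later in this header describe.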
+  // NB: When an allocation is recorded, along with recording its size,
+  // we also set the lifetime to be numeric_limits<uint64_t>::max()
+  // This is to track allocations that are made during the scope of
+  // profiling but were not freed until after the scope ended.
+  // Such allocations are not managed by profiling allocator.
+  std::vector<uint64_t> allocation_lifetimes;
+  // Maps an allocation to some offset in a blob of memory.
+  std::vector<uint64_t> allocation_offsets;
+  uint64_t total_size{0};
+  void clear();
+  friend class AllocationPlanner;
+  friend class CPUProfilingAllocator;
+};
+
+/*
+ * Map of memory ptr to allocation id. This is auxiliary information only
+ * used to establish lifetime of allocations.
+ */
+class C10_API AllocationPlanner {
+ private:
+  AllocationPlan* allocation_plan_{nullptr};
+  // Maps allocated ptr to its allocation id.
+  // This is used when freeing the memory to lookup the allocation id
+  // in order to establish the lifetime of a particular allocation.
+  ska::flat_hash_map<const void*, uint64_t> allocation_ptr_to_id_;
+  uint64_t allocation_id_{0};
+  bool validation_mode_{false};
+
+  bool validate_allocation(const uint64_t size, const void* ptr);
+  bool validate_free(const void* ptr);
+ public:
+  bool validation_success{true};
+
+  AllocationPlanner() = delete;
+  AllocationPlanner(AllocationPlan* plan, bool validate = false) :
+    allocation_plan_(plan), validation_mode_(validate) {}
+  void record_allocation(const uint64_t size, const void* ptr);
+  void record_free(const void* ptr);
+  void formulate_plan();
+  void clear();
+};
+
+// NOT THREAD SAFE profiling allocator.
+class C10_API CPUProfilingAllocator {
+ private:
+  const AllocationPlan* plan_{nullptr};
+  uint64_t allocation_id_{0};
+  uint64_t current_size_{0};
+  void* blob_{nullptr};
+  ska::flat_hash_map<const void*, uint64_t> allocation_ptr_to_id_;
+ public:
+  ~CPUProfilingAllocator();
+  void set_plan(const AllocationPlan* plan);
+  void unset_plan();
+  void* allocate(const size_t bytes);
+  void free(void* const ptr);
+};
+
+/*
+ * Usage: Profile allocations made by one run of the model.
+ * AllocationPlan plan;
+ * {
+ *   WithProfileAllocationsGuard profile_guard(&plan);
+ *   module.forward(...);
+ * }
+ * plan now contains allocation plan.
+ */
+class C10_API WithProfileAllocationsGuard {
+ public:
+  WithProfileAllocationsGuard(AllocationPlan* plan);
+  ~WithProfileAllocationsGuard();
+ private:
+  std::unique_ptr<AllocationPlanner> planner_;
+};
+
+/*
+ * Usage: Validate allocation plan made with WithProfileAllocationsGuard
+ * bool plan_validation_success, success = true;
+ * for (some number of representative inputs)
+ * {
+ *   WithValidateAllocationPlanGuard validation_guard(&plan, &plan_validation_success);
+ *   module.forward(...);
+ *   success = success && plan_validation_success;
+ * }
+ * success == true means allocations are according to plan,
+ * else the allocation pattern changed for some inputs.
+ */
+class C10_API WithValidateAllocationPlanGuard {
+ public:
+  WithValidateAllocationPlanGuard(AllocationPlan* plan, bool* success);
+  ~WithValidateAllocationPlanGuard();
+ private:
+  std::unique_ptr<AllocationPlanner> planner_;
+  bool* success_;
+};
+
+AllocationPlanner* GetThreadLocalAllocationPlanner();
+
+/*
+ * Usage: Allocate tensors according to the allocation plan.
+ * First make allocation plan.
+ * See WithProfileAllocationsGuard usage.
+ * Second validate allocation plan.
+ * See WithValidateAllocationPlanGuard usage.
+ * CPUProfilingAllocator profiling_allocator; + * { + * WithProfilingAllocatorGuard allocator_guard(&profiling_allocator, &plan); + * module.forward(...); + * } + */ +class C10_API WithProfilingAllocatorGuard { + public: + WithProfilingAllocatorGuard( + CPUProfilingAllocator* allocator, const AllocationPlan* plan); + ~WithProfilingAllocatorGuard(); +}; + +CPUProfilingAllocator* GetThreadLocalProfilingAllocator(); + +} // namespace c10 diff --git a/caffe2/core/plan_executor.cc b/caffe2/core/plan_executor.cc index 3f70e96fffc8..c7c0200e5880 100644 --- a/caffe2/core/plan_executor.cc +++ b/caffe2/core/plan_executor.cc @@ -17,10 +17,18 @@ C10_DEFINE_bool( "If used we will handle exceptions in executor threads. " "This avoids SIGABRT but may cause process to deadlock"); +C10_DEFINE_int( + caffe2_plan_executor_exception_timeout, + 60, + "Number of seconds to wait for concurrent threads to stop on exception" + "before terminating."); + namespace caffe2 { namespace { +// ExceptionWrapper holds an exception. If exception pointers are being used, +// it'll hold the original exception pointer otherwise just the message. class ExceptionWrapper { public: ExceptionWrapper() : hasException_(false) {} @@ -39,6 +47,10 @@ class ExceptionWrapper { #endif } + const std::string& what() const { + return exceptionMsg_; + } + operator bool() { return hasException_; } @@ -51,6 +63,33 @@ class ExceptionWrapper { std::string exceptionMsg_; }; +// ExceptionWrapperTerminate terminates the program with the specified +// exception. This preserves the exception ptr and ExceptionTracer will +// correctly grab it on exit. +class ExceptionWrapperTerminate { + public: + explicit ExceptionWrapperTerminate(ExceptionWrapper&& ew) : ew_(std::move(ew)) {} + + ~ExceptionWrapperTerminate() { + ew_.rethrowException(); + } + + private: + ExceptionWrapper ew_; +}; + +// ScopeExitGuard runs the provided function when it's destructed. +class ScopeExitGuard { + public: + explicit ScopeExitGuard(std::function&& f) : f_(std::move(f)) {} + ~ScopeExitGuard() { + f_(); + } + + private: + std::function f_; +}; + struct NetDefInfo { const NetDef* netDef; // in order to keep the "override existing nets" on the top-level workflow, @@ -460,9 +499,16 @@ bool ExecuteStepRecursive(ExecutionStepWrapper& stepWrapper) { << " with " << step.substep().size() << " concurrent substeps"; std::atomic next_substep{0}; + std::condition_variable cv; + std::atomic done{0}; std::mutex exception_mutex; ExceptionWrapper first_exception; auto worker = [&]() { + ScopeExitGuard on_exit([&] { + done += 1; + cv.notify_all(); + }); + auto num_substeps = compiledStep->recurringSubsteps.size(); int substep_id = next_substep++ % num_substeps; if (compiledStep->gotFailure) { @@ -500,6 +546,23 @@ bool ExecuteStepRecursive(ExecutionStepWrapper& stepWrapper) { for (size_t i = 0; i < numThreads; ++i) { threads.emplace_back(worker); } + + auto workersDone = [&] { return done == numThreads; }; + + // If we get an exception, try to wait for all threads to stop + // gracefully. 
+ std::unique_lock guard(exception_mutex); + cv.wait(guard, [&] { return workersDone() || first_exception; }); + cv.wait_for( + guard, + std::chrono::seconds(FLAGS_caffe2_plan_executor_exception_timeout), + [&] { return workersDone(); }); + if (!workersDone() && first_exception) { + LOG(ERROR) << "failed to stop concurrent workers after exception: " + << first_exception.what(); + ExceptionWrapperTerminate(std::move(first_exception)); + } + for (auto& thread : threads) { thread.join(); } diff --git a/caffe2/core/plan_executor_test.cc b/caffe2/core/plan_executor_test.cc index 86f145d72a09..1b0eb0e718a2 100644 --- a/caffe2/core/plan_executor_test.cc +++ b/caffe2/core/plan_executor_test.cc @@ -67,6 +67,29 @@ class ErrorOp final : public Operator { REGISTER_CPU_OPERATOR(Error, ErrorOp); OPERATOR_SCHEMA(Error).NumInputs(0).NumOutputs(0); +static std::atomic blockingErrorRuns{0}; +class BlockingErrorOp final : public Operator { + public: + BlockingErrorOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + + bool RunOnDevice() override { + // First n op executions should block and then start throwing errors. + if (blockingErrorRuns.fetch_sub(1) >= 1) { + LOG(INFO) << "blocking"; + while (true) { + std::this_thread::sleep_for(std::chrono::hours(10)); + } + } else { + LOG(INFO) << "throwing"; + throw TestError(); + } + } +}; + +REGISTER_CPU_OPERATOR(BlockingError, BlockingErrorOp); +OPERATOR_SCHEMA(BlockingError).NumInputs(0).NumOutputs(0); + PlanDef parallelErrorPlan() { PlanDef plan_def; @@ -101,10 +124,12 @@ PlanDef parallelErrorPlan() { } struct HandleExecutorThreadExceptionsGuard { - HandleExecutorThreadExceptionsGuard() { + HandleExecutorThreadExceptionsGuard(int timeout = 60) { globalInit({ "caffe2", "--caffe2_handle_executor_threads_exceptions=1", + "--caffe2_plan_executor_exception_timeout=" + + caffe2::to_string(timeout), }); } @@ -139,6 +164,38 @@ TEST(PlanExecutorTest, ErrorAsyncPlan) { ASSERT_EQ(cancelCount, 1); } +TEST(PlanExecutorTest, BlockingErrorPlan) { + ASSERT_DEATH( + [] { + HandleExecutorThreadExceptionsGuard guard(/*timeout=*/1); + + PlanDef plan_def; + + std::string plan_def_template = R"DOC( + network { + name: "net" + op { + type: "BlockingError" + } + } + execution_step { + num_concurrent_instances: 2 + substep { + network: "net" + } + } + )DOC"; + + CAFFE_ENFORCE( + TextFormat::ParseFromString(plan_def_template, &plan_def)); + Workspace ws; + blockingErrorRuns = 1; + ws.RunPlan(plan_def); + FAIL() << "shouldn't have reached this point"; + }(), + "failed to stop concurrent workers after exception: test error"); +} + } // namespace caffe2 #endif diff --git a/caffe2/operators/slice_op.cc b/caffe2/operators/slice_op.cc index 7acf854ba9da..f9fd39303261 100644 --- a/caffe2/operators/slice_op.cc +++ b/caffe2/operators/slice_op.cc @@ -17,7 +17,7 @@ Produces a slice of the input tensor. - Start and end indices are either passed as two 1D input tensors or using the `starts` and `ends` arguments. -- If a negative value is passed for any of the start or end indices, it represents the number of elements before the end of that dimension. End indices are non-inclusive unless negative (end index -1 means up to and including the last element). +- If a negative value is passed for any of the start or end indices, it represents |value| - 1 elements before the end of that dimension. End indices are non-inclusive unless negative (end index -1 means up to and including the last element). 
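The reworded semantics match the normalization the operator applies internally (see the slice_op.h and slice_op.cu hunks below): a negative index v on a dimension of size n is mapped to n + 1 + v and then clamped to n. A small standalone sketch of that arithmetic, with normalize_index being a hypothetical helper rather than anything in the operator:

    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    // Simplified mirror of the index normalization in SliceImpl: negative
    // indices count from the end of the dimension, and the result is clamped
    // to the dimension size.
    int64_t normalize_index(int64_t idx, int64_t dim_size) {
      if (idx < 0) {
        idx = dim_size + 1 + idx;
      }
      return std::min(idx, dim_size);
    }

    int main() {
      const int64_t n = 10;
      // starts = -3, ends = -1 on a dimension of size 10 selects [8, 10),
      // i.e. the last two elements.
      std::cout << normalize_index(-3, n) << " "    // 8
                << normalize_index(-1, n) << "\n";  // 10
    }

With this convention an end index of -1 keeps the last element, unlike Python slicing, where -1 as a stop excludes it.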
Github Links: - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/slice_op.cc @@ -67,11 +67,11 @@ print("Y:", workspace.FetchBlob("Y")) .Input( 1, "starts", - "(*Tensor``*): 1D tensor of start-indices for each dimension of data") + "(*Tensor``*): 1D tensor of start-indices for each dimension of data (dimensions following the sliced one might be omitted)") .Input( 2, "ends", - "(*Tensor``*): 1D tensor of end-indices for each dimension of data") + "(*Tensor``*): 1D tensor of end-indices for each dimension of data (dimensions following the sliced one might be omitted)") .Arg("starts", "(*Tuple(int)*): list of starting indices") .Arg("ends", "(*Tuple(int)*): list of ending indices") .TensorInferenceFunction([](const OperatorDef& def, @@ -90,9 +90,10 @@ print("Y:", workspace.FetchBlob("Y")) for (int i = 0; i < data.dims_size(); ++i) { if (i >= starts.size()) { + dst_sizes[i] = data.dims(i); continue; } - if (data.dims_size() > 0) { + if (data.dims(i) > 0) { auto start = starts[i]; auto end = ends[i]; if (start < 0) { diff --git a/caffe2/operators/slice_op.cu b/caffe2/operators/slice_op.cu index 7a843fee3a52..184385310c9c 100644 --- a/caffe2/operators/slice_op.cu +++ b/caffe2/operators/slice_op.cu @@ -74,22 +74,23 @@ bool SliceImplGpu( if (i >= starts.numel()) { starts_idx[i] = 0; ends_idx[i] = data.size(i); + dst_sizes[i] = data.size(i); continue; } if (data.size(i) > 0) { auto start = starts_data[i]; auto end = ends_data[i]; if (start < 0) { - start = data.sizes()[i] + 1 + start; + start = data.size(i) + 1 + start; } if (end < 0) { - end = data.sizes()[i] + 1 + end; + end = data.size(i) + 1 + end; } - if (start > data.sizes()[i]) { - start = data.sizes()[i]; + if (start > data.size(i)) { + start = data.size(i); } - if (end > data.sizes()[i]) { - end = data.sizes()[i]; + if (end > data.size(i)) { + end = data.size(i); } CAFFE_ENFORCE_GE(start, 0); CAFFE_ENFORCE_GE(end, 0); @@ -115,7 +116,7 @@ bool SliceImplGpu( // for now only supports slicing in 1 dimension int dim = -1; for (int i = 0; i < data.dim(); ++i) { - if (starts_idx[i] > 0 || ends_idx[i] < data.sizes()[i]) { + if (starts_idx[i] > 0 || ends_idx[i] < data.size(i)) { CAFFE_ENFORCE_EQ( dim, -1, "Currently only possible to slice in 1 dimension."); dim = i; @@ -154,7 +155,7 @@ bool SliceImplGpu( size_t src_nbytes = data.nbytes(); size_t dst_nbytes = output->nbytes(); - size_t src_block_size = unit * data.sizes()[dim]; + size_t src_block_size = unit * data.size(dim); size_t dst_block_size = unit * (ends_idx[dim] - starts_idx[dim]); size_t src_offset = unit * starts_idx[dim]; @@ -187,7 +188,7 @@ bool SliceImplGpu( size_t dst_nbytes = gdata->nbytes(); size_t src_block_size = unit * (ends_idx[dim] - starts_idx[dim]); - size_t dst_block_size = unit * data.sizes()[dim]; + size_t dst_block_size = unit * data.size(dim); size_t dst_offset = unit * starts_idx[dim]; if (num_blocks == 0 || dst_block_size == 0) { diff --git a/caffe2/operators/slice_op.h b/caffe2/operators/slice_op.h index 8d1990e54c38..9706472315b6 100644 --- a/caffe2/operators/slice_op.h +++ b/caffe2/operators/slice_op.h @@ -33,23 +33,24 @@ bool SliceImpl( for (int i = 0; i < data.dim(); ++i) { if (i >= starts.numel()) { starts_idx[i] = 0; - ends_idx[i] = data.sizes()[i]; + ends_idx[i] = data.size(i); + dst_sizes[i] = data.size(i); continue; } - if (data.sizes()[i] > 0) { + if (data.size(i) > 0) { auto start = starts_data[i]; auto end = ends_data[i]; if (start < 0) { - start = data.sizes()[i] + 1 + start; + start = data.size(i) + 1 + start; } if (end < 0) { - end = 
data.sizes()[i] + 1 + end; + end = data.size(i) + 1 + end; } - if (start > data.sizes()[i]) { - start = data.sizes()[i]; + if (start > data.size(i)) { + start = data.size(i); } - if (end > data.sizes()[i]) { - end = data.sizes()[i]; + if (end > data.size(i)) { + end = data.size(i); } CAFFE_ENFORCE_GE(start, 0); CAFFE_ENFORCE_GE(end, 0); @@ -78,7 +79,7 @@ bool SliceImpl( // for now only supports slicing in 1 dimension int dim = -1; for (int i = 0; i < data.dim(); ++i) { - if (starts_idx[i] > 0 || ends_idx[i] < data.sizes()[i]) { + if (starts_idx[i] > 0 || ends_idx[i] < data.size(i)) { CAFFE_ENFORCE_EQ( dim, -1, "Currently only possible to slice in 1 dimension."); dim = i; @@ -117,7 +118,7 @@ bool SliceImpl( size_t src_nbytes = data.nbytes(); size_t dst_nbytes = output->nbytes(); - size_t src_block_size = unit * data.sizes()[dim]; + size_t src_block_size = unit * data.size(dim); size_t dst_block_size = unit * (ends_idx[dim] - starts_idx[dim]); size_t src_offset = unit * starts_idx[dim]; @@ -155,7 +156,7 @@ bool SliceImpl( size_t dst_nbytes = gdata->nbytes(); size_t src_block_size = unit * (ends_idx[dim] - starts_idx[dim]); - size_t dst_block_size = unit * data.sizes()[dim]; + size_t dst_block_size = unit * data.size(dim); size_t dst_offset = unit * starts_idx[dim]; if (num_blocks == 0 || dst_block_size == 0) { diff --git a/caffe2/operators/string_ops.cc b/caffe2/operators/string_ops.cc index 76fedeb488f8..7339d772f473 100644 --- a/caffe2/operators/string_ops.cc +++ b/caffe2/operators/string_ops.cc @@ -71,6 +71,17 @@ struct EndsWith { std::string suffix_; }; +struct StrEquals { + explicit StrEquals(OperatorBase& op) + : text_(op.GetSingleArgument("text", "")) {} + bool operator()(const std::string& str) { + return str == text_; + } + + private: + std::string text_; +}; + struct Prefix { explicit Prefix(OperatorBase& op) : length_(op.GetSingleArgument("length", 3)) {} @@ -108,6 +119,9 @@ REGISTER_CPU_OPERATOR( REGISTER_CPU_OPERATOR( StringEndsWith, StringElementwiseOp>); +REGISTER_CPU_OPERATOR( + StringEquals, + StringElementwiseOp>); REGISTER_CPU_OPERATOR(StringJoin, StringJoinOp); OPERATOR_SCHEMA(StringPrefix) @@ -164,6 +178,17 @@ Returns tensor of boolean of the same dimension of input. .Input(0, "strings", "Tensor of std::string.") .Output(0, "bools", "Tensor of bools of same shape as input."); +OPERATOR_SCHEMA(StringEquals) + .NumInputs(1) + .NumOutputs(1) + .SetDoc(R"DOC( +Performs equality check on each string in the input tensor. +Returns tensor of booleans of the same dimension as input. 
+)DOC") + .Arg("text", "The text to check input strings equality against.") + .Input(0, "strings", "Tensor of std::string.") + .Output(0, "bools", "Tensor of bools of same shape as input."); + OPERATOR_SCHEMA(StringJoin) .NumInputs(1) .NumOutputs(1) @@ -187,6 +212,7 @@ SHOULD_NOT_DO_GRADIENT(StringPrefix); SHOULD_NOT_DO_GRADIENT(StringSuffix); SHOULD_NOT_DO_GRADIENT(StringStartsWith); SHOULD_NOT_DO_GRADIENT(StringEndsWith); +SHOULD_NOT_DO_GRADIENT(StringEquals); SHOULD_NOT_DO_GRADIENT(StringJoin); } } // namespace caffe2 diff --git a/caffe2/python/operator_test/string_ops_test.py b/caffe2/python/operator_test/string_ops_test.py index eedb57be1d6c..a0c56a686666 100644 --- a/caffe2/python/operator_test/string_ops_test.py +++ b/caffe2/python/operator_test/string_ops_test.py @@ -119,6 +119,33 @@ def string_ends_with_ref(strings): [strings], string_ends_with_ref) + @given(strings=st.text(alphabet=['a', 'b'])) + @settings(deadline=1000) + def test_string_equals(self, strings): + text = "" + if strings: + text = strings[0] + + strings = np.array( + [str(a) for a in strings], dtype=np.object + ) + + def string_equals_ref(strings): + return ( + np.array([a == text for a in strings], dtype=bool), + ) + + op = core.CreateOperator( + 'StringEquals', + ['strings'], + ['bools'], + text=text) + self.assertReferenceChecks( + hu.cpu_do, + op, + [strings], + string_equals_ref) + if __name__ == "__main__": import unittest unittest.main() diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake index 8b60915f7e00..c9ac37783d1c 100644 --- a/cmake/public/cuda.cmake +++ b/cmake/public/cuda.cmake @@ -478,7 +478,7 @@ foreach(diag cc_clobber_ignored integer_sign_change useless_using_declaration endforeach() # Set C++14 support -set(CUDA_PROPAGATE_HOST_FLAGS_BLACKLIST "-Werror") +set(CUDA_PROPAGATE_HOST_FLAGS_BLOCKLIST "-Werror") if(MSVC) list(APPEND CUDA_NVCC_FLAGS "--Werror" "cross-execution-space-call") list(APPEND CUDA_NVCC_FLAGS "--no-host-device-move-forward") @@ -490,7 +490,7 @@ endif() # OpenMP flags for NVCC with Clang-cl if("${CMAKE_CXX_SIMULATE_ID}" STREQUAL "MSVC" AND "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - list(APPEND CUDA_PROPAGATE_HOST_FLAGS_BLACKLIST "-Xclang" "-fopenmp") + list(APPEND CUDA_PROPAGATE_HOST_FLAGS_BLOCKLIST "-Xclang" "-fopenmp") if(MSVC_TOOLSET_VERSION LESS 142) list(APPEND CUDA_NVCC_FLAGS "-Xcompiler" "-openmp") else() diff --git a/docs/source/community/persons_of_interest.rst b/docs/source/community/persons_of_interest.rst index c152ca616571..f346fbe994e6 100644 --- a/docs/source/community/persons_of_interest.rst +++ b/docs/source/community/persons_of_interest.rst @@ -25,7 +25,6 @@ torch.* torch.nn ~~~~~~~~ -- Thomas Viehmann (`t-vi `__) - Adam Paszke (`apaszke `__) - Greg Chanan (`gchanan `__) - Soumith Chintala (`soumith `__) diff --git a/docs/source/quantization.rst b/docs/source/quantization.rst index b597fa9f51f3..b78ed2c08586 100644 --- a/docs/source/quantization.rst +++ b/docs/source/quantization.rst @@ -77,6 +77,261 @@ The corresponding implementation is chosen automatically based on the PyTorch bu ``torch.backends.quantized.engine = 'qnnpack'`` +Quantization API Summary +--------------------------------------- + +There are three types of quantization supported in PyTorch: + +1. dynamic quantization (weights quantized with activations read/stored in + floating point and quantized for compute.) +2. static quantization (weights quantized, activations quantized, calibration + required post training) +3. 
quantization aware training (weights quantized, activations quantized, + quantization numerics modeled during training) + +Please see our `Introduction to Quantization on Pytorch +`_ blog post +for a more comprehensive overview of the tradeoffs between these quantization +types. + +Dynamic Quantization +^^^^^^^^^^^^^^^^^^^^ + +This is the simplest to apply form of quantization where the weights are +quantized ahead of time but the activations are dynamically quantized +during inference. This is used for situations where the model execution time +is dominated by loading weights from memory rather than computing the matrix +multiplications. This is true for for LSTM and Transformer type models with +small batch size. + +Diagram:: + + # original model + # all tensors and computations are in floating point + previous_layer_fp32 -- linear_fp32 -- activation_fp32 -- next_layer_fp32 + / + linear_weight_fp32 + + # dynamically quantized model + # linear and conv weights are in int8 + previous_layer_fp32 -- linear_int8_w_fp32_inp -- activation_fp32 -- next_layer_fp32 + / + linear_weight_int8 + +API example:: + + import torch + + # define a floating point model + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + self.fc = torch.nn.Linear(4, 4) + + def forward(self, x): + x = self.fc(x) + return x + + # create a model instance + model_fp32 = M() + # create a quantized model instance + model_int8 = torch.quantization.quantize_dynamic( + model_fp32, # the original model + {torch.nn.Linear}, # a set of layers to dynamically quantize + dtype=torch.qint8) # the target dtype for quantized weights + + # run the model + input_fp32 = torch.randn(4, 4, 4, 4) + res = model_int8(input_fp32) + +To learn more about dynamic quantization please see our `dynamic quantization tutorial +`_. + +Static Quantization +^^^^^^^^^^^^^^^^^^^^ + +Static quantization quantizes the weights and activations of the model. It +fuses activations into preceding layers where possible. It requires +calibration with a representative dataset to determine optimal quantization +parameters for activations. Post Training Quantization is typically used when +both memory bandwidth and compute savings are important with CNNs being a +typical use case. Static quantization is also known as Post Training +Quantization or PTQ. 
+ +Diagram:: + + # original model + # all tensors and computations are in floating point + previous_layer_fp32 -- linear_fp32 -- activation_fp32 -- next_layer_fp32 + / + linear_weight_fp32 + + # statically quantized model + # weights and activations are in int8 + previous_layer_int8 -- linear_with_activation_int8 -- next_layer_int8 + / + linear_weight_int8 + +API Example:: + + import torch + + # define a floating point model where some layers could be statically quantized + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + # QuantStub converts tensors from floating point to quantized + self.quant = torch.quantization.QuantStub() + self.conv = torch.nn.Conv2d(1, 1, 1) + self.relu = torch.nn.ReLU() + # DeQuantStub converts tensors from quantized to floating point + self.dequant = torch.quantization.DeQuantStub() + + def forward(self, x): + # manually specify where tensors will be converted from floating + # point to quantized in the quantized model + x = self.quant(x) + x = self.conv(x) + x = self.relu(x) + # manually specify where tensors will be converted from quantized + # to floating point in the quantized model + x = self.dequant(x) + return x + + # create a model instance + model_fp32 = M() + + # model must be set to eval mode for static quantization logic to work + model_fp32.eval() + + # attach a global qconfig, which contains information about what kind + # of observers to attach. Use 'fbgemm' for server inference and + # 'qnnpack' for mobile inference. Other quantization configurations such + # as selecting symmetric or assymetric quantization and MinMax or L2Norm + # calibration techniques can be specified here. + model_fp32.qconfig = torch.quantization.get_default_qconfig('fbgemm') + + # Fuse the activations to preceding layers, where applicable. + # This needs to be done manually depending on the model architecture. + # Common fusions include `conv + relu` and `conv + batchnorm + relu` + model_fp32_fused = torch.quantization.fuse_modules(model_fp32, [['conv', 'relu']]) + + # Prepare the model for static quantization. This inserts observers in + # the model that will observe activation tensors during calibration. + model_fp32_prepared = torch.quantization.prepare(model_fp32_fused) + + # calibrate the prepared model to determine quantization parameters for activations + # in a real world setting, the calibration would be done with a representative dataset + input_fp32 = torch.randn(4, 1, 4, 4) + model_fp32_prepared(input_fp32) + + # Convert the observed model to a quantized model. This does several things: + # quantizes the weights, computes and stores the scale and bias value to be + # used with each activation tensor, and replaces key operators with quantized + # implementations. + model_int8 = torch.quantization.convert(model_fp32_prepared) + + # run the model, relevant calculations will happen in int8 + res = model_int8(input_fp32) + +To learn more about static quantization, please see the `static quantization tutorial +`_. + +Quantization Aware Training +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Quantization Aware Training models the effects of quantization during training +allowing for higher accuracy compared to other quantization methods. During +training, all calculations are done in floating point, with fake_quant modules +modeling the effects of quantization by clamping and rounding to simulate the +effects of INT8. After model conversion, weights and +activations are quantized, and activations are fused into the preceding layer +where possible. 
It is commonly used with CNNs and yields a higher accuracy +compared to static quantization. Quantization Aware Training is also known as +QAT. + +Diagram:: + + # original model + # all tensors and computations are in floating point + previous_layer_fp32 -- linear_fp32 -- activation_fp32 -- next_layer_fp32 + / + linear_weight_fp32 + + # model with fake_quants for modeling quantization numerics during training + previous_layer_fp32 -- fq -- linear_fp32 -- activation_fp32 -- fq -- next_layer_fp32 + / + linear_weight_fp32 -- fq + + # quantized model + # weights and activations are in int8 + previous_layer_int8 -- linear_with_activation_int8 -- next_layer_int8 + / + linear_weight_int8 + +API Example:: + + import torch + + # define a floating point model where some layers could benefit from QAT + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + # QuantStub converts tensors from floating point to quantized + self.quant = torch.quantization.QuantStub() + self.conv = torch.nn.Conv2d(1, 1, 1) + self.bn = torch.nn.BatchNorm2d(1) + self.relu = torch.nn.ReLU() + # DeQuantStub converts tensors from quantized to floating point + self.dequant = torch.quantization.DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + x = self.dequant(x) + return x + + # create a model instance + model_fp32 = M() + + # model must be set to train mode for QAT logic to work + model_fp32.train() + + # attach a global qconfig, which contains information about what kind + # of observers to attach. Use 'fbgemm' for server inference and + # 'qnnpack' for mobile inference. Other quantization configurations such + # as selecting symmetric or assymetric quantization and MinMax or L2Norm + # calibration techniques can be specified here. + model_fp32.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm') + + # fuse the activations to preceding layers, where applicable + # this needs to be done manually depending on the model architecture + model_fp32_fused = torch.quantization.fuse_modules(model_fp32, + [['conv', 'bn', 'relu']]) + + # Prepare the model for QAT. This inserts observers and fake_quants in + # the model that will observe weight and activation tensors during calibration. + model_fp32_prepared = torch.quantization.prepare_qat(model_fp32_fused) + + # run the training loop (not shown) + training_loop(model_fp32_prepared) + + # Convert the observed model to a quantized model. This does several things: + # quantizes the weights, computes and stores the scale and bias value to be + # used with each activation tensor, fuses modules where appropriate, + # and replaces key operators with quantized implementations. + model_fp32_prepared.eval() + model_int8 = torch.quantization.convert(model_fp32_prepared) + + # run the model, relevant calculations will happen in int8 + res = model_int8(input_fp32) + +To learn more about quantization aware training, please see the `QAT +tutorial +`_. + Quantized Tensors --------------------------------------- @@ -121,79 +376,8 @@ cover typical CNN and RNN models torch.nn.quantized torch.nn.quantized.dynamic -Quantization Workflows ----------------------- - -PyTorch provides three approaches to quantize models. - -.. _quantization tutorials: - https://pytorch.org/tutorials/#quantization-experimental - -1. Post Training Dynamic Quantization: This is the simplest to apply form of - quantization where the weights are quantized ahead of time but the - activations are dynamically quantized during inference. 
This is used - for situations where the model execution time is dominated by loading - weights from memory rather than computing the matrix multiplications. - This is true for for LSTM and Transformer type models with small - batch size. Applying dynamic quantization to a whole model can be - done with a single call to :func:`torch.quantization.quantize_dynamic()`. - See the `quantization tutorials`_ -2. Post Training Static Quantization: This is the most commonly used form of - quantization where the weights are quantized ahead of time and the - scale factor and bias for the activation tensors is pre-computed - based on observing the behavior of the model during a calibration - process. Post Training Quantization is typically when both memory bandwidth - and compute savings are important with CNNs being a typical use case. - The general process for doing post training quantization is: - - - - 1. Prepare the model: - - a. Specify where the activations are quantized and dequantized explicitly - by adding QuantStub and DeQuantStub modules. - b. Ensure that modules are not reused. - c. Convert any operations that require requantization into modules - - 2. Fuse operations like conv + relu or conv+batchnorm + relu together to - improve both model accuracy and performance. - - 3. Specify the configuration of the quantization methods \'97 such as - selecting symmetric or asymmetric quantization and MinMax or - L2Norm calibration techniques. - 4. Use the :func:`torch.quantization.prepare` to insert modules - that will observe activation tensors during calibration - 5. Calibrate the model by running inference against a calibration - dataset - 6. Finally, convert the model itself with the - torch.quantization.convert() method. This does several things: it - quantizes the weights, computes and stores the scale and bias - value to be used each activation tensor, and replaces key - operators quantized implementations. - - See the `quantization tutorials`_ - - -3. Quantization Aware Training: In the rare cases where post training - quantization does not provide adequate accuracy training can be done - with simulated quantization using the - :class:`torch.quantization.FakeQuantize`. Computations will take place in - FP32 but with values clamped and rounded to simulate the effects of INT8 - quantization. The sequence of steps is very similar. - - - 1. Steps (1) and (2) are identical. - - 3. Specify the configuration of the fake quantization methods \'97 such as - selecting symmetric or asymmetric quantization and MinMax or Moving Average - or L2Norm calibration techniques. - 4. Use the :func:`torch.quantization.prepare_qat` to insert modules - that will simulate quantization during training. - 5. Train or fine tune the model. - 6. Identical to step (6) for post training quantization - - See the `quantization tutorials`_ - +Quantization Customizations +--------------------------- While default implementations of observers to select the scale factor and bias based on observed tensor data are provided, developers can provide their own @@ -218,9 +402,15 @@ prior to quantization. This is because currently quantization works on a module by module basis. Specifically, for all quantization techniques, the user needs to: 1. Convert any operations that require output requantization (and thus have - additional parameters) from functionals to module form. + additional parameters) from functionals to module form (for example, + using ``torch.nn.ReLU`` instead of ``torch.nn.functional.relu``). 2. 
Specify which parts of the model need to be quantized either by assigning - ```.qconfig`` attributes on submodules or by specifying ``qconfig_dict`` + ``.qconfig`` attributes on submodules or by specifying ``qconfig_dict``. + For example, setting ``model.conv1.qconfig = None`` means that the + ``model.conv`` layer will not be quantized, and setting + ``model.linear1.qconfig = custom_qconfig`` means that the quantization + settings for ``model.linear1`` will be using ``custom_qconfig`` instead + of the global qconfig. For static quantization techniques which quantize activations, the user needs to do the following in addition: @@ -238,6 +428,13 @@ to do the following in addition: to be fused. We currently support the following fusions: [Conv, Relu], [Conv, BatchNorm], [Conv, BatchNorm, Relu], [Linear, Relu] +Best Practices +-------------- + +1. Set the ``reduce_range`` argument on observers to `True` if you are using the + ``fbgemm`` backend. This argument prevents overflow on some int8 instructions + by reducing the range of quantized data type by 1 bit. + Modules that provide quantization functions and classes ------------------------------------------------------- diff --git a/ios/LibTorch.podspec b/ios/LibTorch.podspec index 17e9fb26afa1..f74e2dc9f37e 100644 --- a/ios/LibTorch.podspec +++ b/ios/LibTorch.podspec @@ -1,6 +1,6 @@ Pod::Spec.new do |s| s.name = 'LibTorch' - s.version = '1.6.0' + s.version = '1.6.1' s.authors = 'PyTorch Team' s.license = { :type => 'BSD' } s.homepage = 'https://github.com/pytorch/pytorch' diff --git a/setup.py b/setup.py index b2270db497cf..c29ee929b8ca 100644 --- a/setup.py +++ b/setup.py @@ -346,7 +346,7 @@ def check_file(f): install_requires = [ 'future', 'typing_extensions', - 'dataclasses; python_version < "3.8"' + 'dataclasses; python_version < "3.7"' ] missing_pydep = ''' diff --git a/test/cpp/rpc/test_e2e_process_group.cpp b/test/cpp/rpc/test_e2e_process_group.cpp index d509a4606fa1..7c5af57d6a09 100644 --- a/test/cpp/rpc/test_e2e_process_group.cpp +++ b/test/cpp/rpc/test_e2e_process_group.cpp @@ -19,6 +19,7 @@ class TestE2EProcessGroup : public TestE2EBase { options.devices.push_back( ::c10d::ProcessGroupGloo::createDeviceForHostname(serverAddress)); std::chrono::milliseconds rpcTimeout(30000); + options.timeout = rpcTimeout; // Initialize server rpc agent. 
auto pg = diff --git a/test/distributed/test_c10d.py b/test/distributed/test_c10d.py index 53a118e8f15b..9d0c19bef7b3 100644 --- a/test/distributed/test_c10d.py +++ b/test/distributed/test_c10d.py @@ -29,7 +29,8 @@ from torch.testing._internal.common_distributed import MultiProcessTestCase, \ requires_gloo, requires_nccl, requires_nccl_version, \ skip_if_not_multigpu, skip_if_lt_x_gpu, get_timeout, skip_if_rocm, \ - simple_sparse_reduce_tests, skip_if_win32, create_device + skip_if_rocm_single_process, simple_sparse_reduce_tests, skip_if_win32, \ + create_device from torch.testing._internal.common_utils import TestCase, load_tests, run_tests, \ retry_on_connect_failures, ADDRESS_IN_USE, CONNECT_TIMEOUT, TEST_WITH_TSAN @@ -1594,13 +1595,30 @@ def create(num, prefix): self.assertEqual(torch.full([10, 10], float(self.world_size)), tensor) del pg +class ProcessGroupNCCLNoGPUTest(TestCase): + MAIN_PROCESS_RANK = 0 + + def setUp(self): + self.rank = self.MAIN_PROCESS_RANK + self.world_size = 1 + self.file = tempfile.NamedTemporaryFile(delete=False) + self.num_gpus = torch.cuda.device_count() + if self.num_gpus > 0: + raise unittest.SkipTest("GPUs are available, skipping test") + + def tearDown(self): + pass + + @requires_nccl() + @skip_if_rocm_single_process + def test_init_no_gpus(self): + store = c10d.FileStore(self.file.name, self.world_size) + with self.assertRaisesRegex( + RuntimeError, + "ProcessGroupNCCL is only supported with GPUs, no GPUs found!"): + c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + -@requires_nccl() -@unittest.skipIf( - TEST_WITH_TSAN, - "TSAN is not fork-safe since we're forking in a multi-threaded environment", -) -@skip_if_rocm class ProcessGroupNCCLTest(TestCase): MAIN_PROCESS_RANK = 0 @@ -1615,6 +1633,8 @@ def setUp(self): def tearDown(self): pass + @requires_nccl() + @skip_if_rocm_single_process def test_empty_tensors(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -1639,6 +1659,8 @@ def test_empty_tensors(self): pg.reduce_scatter(ys, xs).wait() self.assertEqual(0, ys[0].numel()) + @requires_nccl() + @skip_if_rocm_single_process def test_broadcast_ops(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -1661,6 +1683,8 @@ def broadcast(xs, rootRank, rootTensor): for i in range(self.num_gpus): self.assertEqual(tensors[i], tensors[rt]) + @requires_nccl() + @skip_if_rocm_single_process def test_allreduce_ops(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -1722,6 +1746,8 @@ def allreduce(tensors, op): with self.assertRaisesRegex(RuntimeError, "Cannot use " + str(op) + " with NCCL"): allreduce(tensors, op) + @requires_nccl() + @skip_if_rocm_single_process def test_reduce_ops(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -1752,6 +1778,8 @@ def reduce(xs, rootRank, rootTensor, op=None): with self.assertRaisesRegex(RuntimeError, "Cannot use " + str(op) + " with NCCL"): reduce(tensors, self.rank, rt, op) + @requires_nccl() + @skip_if_rocm_single_process def test_allgather_ops(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -1777,6 +1805,8 @@ def allgather(output_ts, input_ts): for s_idx, t in enumerate(device_ts): self.assertEqual(torch.tensor([s_idx]), t) + 
@requires_nccl() + @skip_if_rocm_single_process def test_reduce_scatter_ops(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -1854,6 +1884,8 @@ def reduce_scatter(outputs, input_lists, op): # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 self.assertEqualIgnoreType(expected, output[i]) + @requires_nccl() + @skip_if_rocm_single_process def test_barrier(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -2140,6 +2172,7 @@ def test_gloo_backend_1gpu_module_device_ids_torch_device_list(self): @requires_gloo() @skip_if_lt_x_gpu(4) + @skip_if_rocm def test_gloo_backend_2gpu_module(self): int_devices = gpus_for_rank(self.world_size)[self.rank][:2] devices = [torch.device("cuda:" + str(i)) for i in int_devices] diff --git a/test/jit/test_list_dict.py b/test/jit/test_list_dict.py index 8d0f74349b3b..19e4952cad57 100644 --- a/test/jit/test_list_dict.py +++ b/test/jit/test_list_dict.py @@ -408,6 +408,43 @@ def test_over_slice(): return a[3:10] == [3, 4] self.checkScript(test_backward_slice, ()) + def test_slice_index(self): + a = torch.tensor( + [ + [[1, 11], [2, 22]], + [[3, 33], [4, 44]], + [[5, 55], [6, 66]], + ] + ) + + def test_index_slice1(x): + x = x[:, :, [0, 1]] + return x + self.checkScript(test_index_slice1, (a,)) + + def test_index_slice2(x): + x = x[[2, 1, 0], :, :] + return x + self.checkScript(test_index_slice2, (a,)) + + def test_index_slice3(x): + x = x[[0, 1], :, [1]] + return x + self.checkScript(test_index_slice3, (a,)) + + def test_index_slice_empty_list(x): + empty_list: List[int] = [] + x = x[empty_list, :, :] + return x + self.checkScript(test_index_slice_empty_list, (a,)) + + def test_index_slice_out_of_bounds_index(x): + x = x[[4], :, :] + return x + with self.assertRaisesRegex(RuntimeError, "index 4 is out of bounds for dimension 0 with size 3"): + self.checkScript(test_index_slice_out_of_bounds_index, (a,)) + + def test_mutable_list_append(self): def test_append(): a = [0, 1] diff --git a/test/run_test.py b/test/run_test.py index 9ce938f6cd7b..2af7405e300b 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -200,6 +200,15 @@ PYTORCH_COLLECT_COVERAGE = bool(os.environ.get("PYTORCH_COLLECT_COVERAGE")) +JIT_EXECUTOR_TESTS = [ + 'test_jit_cuda_fuser_profiling', + 'test_jit_cuda_fuser_legacy', + 'test_jit_profiling', + 'test_jit_legacy', + 'test_jit_fuser_legacy', + 'test_jit_fuser_te', + 'test_tensorexpr'] + def print_to_stderr(message): print(message, file=sys.stderr) @@ -456,7 +465,12 @@ def parse_args(): type=int, help='runs a shard of the tests (taking into account other selections), e.g., ' '--shard 2 3 will break up the selected tests into 3 shards and run the tests ' - 'in the 2nd shard (the number of shards will be whichever argument is greater)', + 'in the 2nd shard (the first number should not exceed the second)', + ) + parser.add_argument( + '--exclude-jit-executor', + action='store_true', + help='exclude tests that are run for a specific jit config' ) return parser.parse_args() @@ -533,6 +547,9 @@ def get_selected_tests(options): assert num_shards <= len(selected_tests), f"Number of shards must be less than {len(selected_tests)}" selected_tests = selected_tests[which_shard - 1 :: num_shards] + if options.exclude_jit_executor: + options.exclude.extend(JIT_EXECUTOR_TESTS) + selected_tests = exclude_tests(options.exclude, selected_tests) if sys.platform == 'win32' and not 
options.ignore_win_blocklist: diff --git a/test/test_autograd.py b/test/test_autograd.py index e92fbcbf21bb..6bd6925e015f 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -6139,6 +6139,18 @@ def _test_euclidean_large_cdist(sizex, sizey=None): _test_cdist_for_size((1, 1), (S, 1)) _test_euclidean_large_cdist((2000, 5)) + # Ensure that cdist backward with p<1 does not produce NaNs + def test_cdist_grad_p_lt_1_no_nan(self, device): + for p in [0.99, 0.7, 0.5, 0.1, 0.01]: + x = torch.randn(1, 2, device=device) + y = x.clone().detach() + torch.tensor([[1., 0.]], device=device) + x.requires_grad = True + y.requires_grad = True + result = torch.cdist(x, y, p=p) + result.backward(torch.ones_like(result)) + self.assertFalse(torch.isnan(x.grad).any()) + self.assertFalse(torch.isnan(y.grad).any()) + def test_cdist_same_inputs(self, device): # Test to detect issues in cdist gradient calculation # When the distances are 0 diff --git a/test/test_function_schema.py b/test/test_function_schema.py index f2ad2290d326..5a1527373478 100644 --- a/test/test_function_schema.py +++ b/test/test_function_schema.py @@ -14,90 +14,77 @@ def test_serialize_and_deserialize(self): self.assertEqual(parsed_schema, schema) self.assertTrue(parsed_schema.is_backward_compatible_with(schema)) - def test_backward_compatible_args(self): - old_schema = parse_schema('any(Tensor self, int dim) -> Tensor') - new_schema = parse_schema('any(Tensor self, int? dim) -> Tensor') + def test_backward_compatible_structure(self): + old_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor') + # BC: A new schema without changes. + new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor') self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, int dim=5) -> Tensor') - self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> Tensor') - self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - - def test_backward_compatible_kwargs(self): - old_schema = parse_schema('any(Tensor self, *, Tensor out) -> Tensor') - new_schema = parse_schema('any(Tensor self, *, bool extra1=True, Tensor out, bool extra2=False) -> Tensor') - self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, Tensor out) -> Tensor') - self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - - def test_backward_compatible_ret(self): - old_schema = parse_schema('any(Tensor self) -> Tensor?') - new_schema = parse_schema('any(Tensor self) -> Tensor') - self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - - def test_backward_incompatible_name(self): - old_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> Tensor') - new_schema = parse_schema('any_(Tensor self, int dim, bool keepdim=False) -> Tensor') + # No-BC: A new schema with different name. 
+ new_schema = parse_schema('any_.over(Tensor self, *, Tensor b) -> Tensor') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - - def test_backward_incompatible_vararg(self): - old_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> Tensor') - new_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False, ...) -> Tensor') + # No-BC: A new schema with different overload name. + new_schema = parse_schema('any.other(Tensor self, *, Tensor b) -> Tensor') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - - def test_backward_incompatible_returns(self): - old_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> Tensor') - new_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> (Tensor, ...)') + # No-BC: A new schema that adds vararg. + new_schema = parse_schema('any.over(Tensor self, *, Tensor b, ...) -> Tensor') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> int') + # No-BC: A new schema with different number of outputs. + new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> (Tensor, Tensor)') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> Tensor?') + + def test_backward_compatible_outputs(self): + old_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor') + # No-BC: A new schema with output becoming of optional type. + new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor?') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) + # BC: (the opposite case) An schema where the output is not of optional type anymore. self.assertTrue(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> (Tensor, Tensor)') + # No-BC: A new schema with a different output type. + new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> int') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> Tensor out') + # No-BC: A new schema with a different output type. + new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor out') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - def test_backward_incompatible_args(self): - old_schema = parse_schema('any(Tensor self, int[] dims, bool keepdim=False) -> Tensor') - new_schema = parse_schema('any(Tensor s, int[] dims, bool keepdim=False) -> Tensor') + def test_backward_compatible_arguments(self): + old_schema = parse_schema('any(Tensor self, *, Tensor b, int c) -> Tensor') + # No-BC: A new schema with less arguments. 
+ new_schema = parse_schema('any(Tensor self, *, Tensor b) -> Tensor') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, int[3] dims, bool keepdim=False) -> Tensor') + # No-BC: A new schema with more arguments, appended, but no default value. + new_schema = parse_schema('any(Tensor self, *, Tensor b, int c, int d) -> Tensor') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, int[](a) dims, bool keepdim=False) -> Tensor') + # BC: A new schema with more arguments, appended, that have a default value. + new_schema = parse_schema('any(Tensor self, *, Tensor b, int c, int d=1) -> Tensor') + self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) + # No-BC: A new schema with more arguments, not-appended, that have a default value. + new_schema = parse_schema('any(Tensor self, int d=1, *, Tensor b, int c) -> Tensor') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) + # BC: A new schema where old kwargs becomes positional. + new_schema = parse_schema('any(Tensor self, Tensor b, *, int c) -> Tensor') + self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) + # BC: (the opposite case) A new schema where an old positional argument becomes kwarg. self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, int dims, bool keepdim=False) -> Tensor') - self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) + # BC: A new schema where all old kwargs become positional. + new_schema = parse_schema('any(Tensor self, Tensor b, int c) -> Tensor') + self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) + # BC: (the opposite case) A new schema where all old positional arguments become kwarg. self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, int[] dim, bool keepdim=False, bool? extra=None) -> Tensor') + # No-BC: A new schema where old kwargs appear in different order. + new_schema = parse_schema('any(Tensor self, *, int c, Tensor b) -> Tensor') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - - def test_backward_incompatible_kwargs(self): - old_schema = parse_schema('any(Tensor self, int[] dims, *, bool keepdim=False) -> Tensor') - new_schema = parse_schema('any(Tensor self, int[] dims, *, bool keepdim) -> Tensor') + # BC: A new schema where argument becomes of type optional. + new_schema = parse_schema('any(Tensor self, *, Tensor b, int? c) -> Tensor') + self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) + # BC: A new schema where argument gains a default value. + new_schema = parse_schema('any(Tensor self, *, Tensor b, int c=1) -> Tensor') + self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) + # No-BC: A new schema where argument is "renamed". 
+ new_schema = parse_schema('any(Tensor self, *, Tensor b, int renamed) -> Tensor') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertTrue(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, int[] dims, *, bool keepdim=False, bool extra) -> Tensor') + # No-BC: A new schema where argument type changes to an incompatible type. + new_schema = parse_schema('any(Tensor self, *, Tensor b, int[] c) -> Tensor') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - if __name__ == '__main__': run_tests() diff --git a/test/test_fx.py b/test/test_fx.py index 9666ccd395bf..1451c5efe5cb 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -9,7 +9,7 @@ from torch.fx.experimental import GraphManipulation from torch.fx.experimental import shape_prop from torch.fx.experimental.Partitioner import DAG, Partitioner -from torch.fx.experimental.subgraph_creation_example import split_module +from torch.fx.experimental.subgraph_creation_example import split_module from torch.fx.proxy import TraceError @@ -30,6 +30,9 @@ class SimpleTest(torch.nn.Module): def forward(self, x): return torch.relu(x + 3.0) +def a_non_torch_leaf(a, b): + return a + b + class TestFX(JitTestCase): def checkGraphModule(self, m: torch.nn.Module, args, kwargs=None): """Check that an nn.Module's results match the GraphModule version @@ -84,6 +87,17 @@ def forward(self, A, b=4, *args, c=5, **kwargs): t = T() symbolic_trace(t) + def test_custom_import(self): + graph = torch.fx.Graph() + a = graph.placeholder('x') + b = graph.placeholder('y') + c = graph.call_function(a_non_torch_leaf, (a, b)) + d = graph.call_function(torch.sin, (c,)) + graph.output(d) + gm = GraphModule(torch.nn.Module(), graph) + x, y = torch.rand(1), torch.rand(1) + self.assertEqual(torch.sin(x + y), gm(x, y)) + def test_args_kwargs(self): class T(torch.nn.Module): def forward(self, *args, **kwargs): @@ -803,8 +817,8 @@ def __init__(self): self.linear = torch.nn.Linear(4, 5) def forward(self, x, y): - z = self.linear(x + self.param).clamp(min=0.0, max=1.0) - w = self.linear(y).clamp(min=0.0, max=1.0) + z = self.linear(x + self.param).clamp(min=0.0, max=1.0) + w = self.linear(y).clamp(min=0.0, max=1.0) return z + w # symbolically trace model @@ -821,7 +835,7 @@ def mod_partition(node: Node): partition_counter = (partition_counter + 1) % NPARTITIONS return partition - # split module in module with submodules + # split module in module with submodules module_with_submodules = split_module(my_module_traced, my_module, mod_partition) x = torch.rand(3, 4) @@ -832,5 +846,107 @@ def mod_partition(node: Node): self.assertEqual(orig_out, submodules_out) + @skipIfNoTorchVision + def test_replace_uses(self): + rn18 = resnet18() + + class LowerReluTracer(torch.fx.Tracer): + def is_leaf_module(self, m : torch.nn.Module, qualname : str): + if isinstance(m, torch.nn.ReLU): + return False + return super().is_leaf_module(m, qualname) + + rn18_traced = GraphModule(rn18, LowerReluTracer().trace(rn18)) + + to_erase = [] + for node in rn18_traced.graph.nodes: + if node.op == 'call_function' and node.target in [torch.relu, torch.nn.functional.relu]: + kwargs = node.kwargs + # Neg doesn't have in-place + kwargs.pop('inplace') + with torch.fx.graph.insert_before(node): + new_node = rn18_traced.graph.call_function( + the_function=torch.neg, args=node.args, kwargs=node.kwargs) + node.replace_all_uses_with(replace_with=new_node) + 
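Taken together, the schema-compatibility cases above reduce to one pattern: parse the old and new schema strings and ask whether the new one is backward compatible with the old. A minimal sketch using the same `parse_schema` helper the tests exercise; the `any(...)` signatures are the tests' illustrative schemas, not real operators:

    from torch._C import parse_schema

    old = parse_schema('any(Tensor self, *, Tensor b, int c) -> Tensor')

    # BC: appending an argument that has a default value.
    new_ok = parse_schema('any(Tensor self, *, Tensor b, int c, int d=1) -> Tensor')
    assert new_ok.is_backward_compatible_with(old)

    # No-BC: dropping an argument.
    new_bad = parse_schema('any(Tensor self, *, Tensor b) -> Tensor')
    assert not new_bad.is_backward_compatible_with(old)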
to_erase.append(node) + + for node in to_erase: + rn18_traced.graph.erase_node(node) + + def test_insertion_point(self): + graph : torch.fx.Graph = torch.fx.Graph() + x : torch.fx.Node = graph.create_node('placeholder', 'x') + b : torch.fx.Node = graph.create_node('call_function', target=torch.relu, args=(x,)) + output : torch.fx.Node = graph.output(b) + + with torch.fx.graph.insert_before(b): + neg : torch.fx.Node = graph.call_function(the_function=torch.neg, args=(x,)) + _, *relu_args = b.args + b.args = (neg, *relu_args) + + gm = torch.fx.GraphModule(torch.nn.Module(), graph) + + input = torch.randn(33, 44) + self.assertEqual(gm(input), torch.relu(torch.neg(input))) + + + def test_move_before(self): + graph : torch.fx.Graph = torch.fx.Graph() + x : torch.fx.Node = graph.create_node('placeholder', 'x') + b : torch.fx.Node = graph.create_node('call_function', target=torch.relu, args=(x,)) + output : torch.fx.Node = graph.output(b) + + neg : torch.fx.Node = graph.call_function(the_function=torch.neg, args=(x,)) + _, *relu_args = b.args + b.args = (neg, *relu_args) + graph.move_node_before(to_move=neg, before=b) + + gm = torch.fx.GraphModule(torch.nn.Module(), graph) + + input = torch.randn(33, 44) + self.assertEqual(gm(input), torch.relu(torch.neg(input))) + + def test_erase_node_error(self): + st = SimpleTest() + traced = symbolic_trace(st) + + for node in traced.graph.nodes: + # Test deleting with uses both in another Node and at the output + if node.target in [operator.add, torch.relu]: + with self.assertRaisesRegex(RuntimeError, 'but it still had .* uses in the graph!'): + traced.graph.erase_node(node) + + def test_find_uses(self): + graph = torch.fx.Graph() + x = torch.fx.Proxy(graph.placeholder('x')) + + y = torch.relu(x) + z = x + x + u = torch.neg(x) + graph.output((y + z + u).node) + graph.lint() + + uses_of_x = x.node.find_uses() + self.assertEqual(len(uses_of_x), 3) + expected_ops = ['relu', 'add', 'neg'] + for node, expected in zip(uses_of_x, expected_ops): + assert expected in node.name + + def test_multi_insert_point(self): + graph = torch.fx.Graph() + x = torch.fx.Proxy(graph.placeholder('x')) + relu = torch.relu(x) + + with torch.fx.graph.insert_before(relu.node): + y = torch.neg(x) + z = torch.tanh(y) + + graph.output((relu.node, z.node)) + graph.lint() + + expected_ops = ['x', 'neg', 'tanh', 'relu'] + for node, expected in zip(graph.nodes, expected_ops): + assert expected in node.name + if __name__ == '__main__': run_tests() diff --git a/test/test_jit.py b/test/test_jit.py index 494d70ecfcfb..d093a4b8826e 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -10010,6 +10010,21 @@ def method(self, x): with self.assertRaisesRegex(RuntimeError, "Argument y not provided."): ModuleDefault() + def test_type_inferred_from_empty_annotation(self): + """ + Test that the type inferred from an empty or missing annotation is Torch.Tensor wtih `inferred=true` + """ + @torch.jit.script + def fn(x): + return x + + graph = fn.graph + n = next(graph.inputs()) + self.assertTrue(n.type() == torch._C.TensorType.getInferred()) + + with self.assertRaisesRegex(RuntimeError, "Inferred \'x\' to be of type \'Tensor"): + fn(1) + def test_script_define_order(self): class M(torch.jit.ScriptModule): diff --git a/test/test_jit_cuda_fuser_legacy.py b/test/test_jit_cuda_fuser_legacy.py index 41e16df7d686..28ab78370637 100644 --- a/test/test_jit_cuda_fuser_legacy.py +++ b/test/test_jit_cuda_fuser_legacy.py @@ -1,5 +1,5 @@ import sys -sys.argv.append("--ge_config=legacy") 
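The FX tests above rely on the experimental graph-editing helpers at this commit (`torch.fx.graph.insert_before` as a context manager, `call_function(the_function=...)`); those names follow the test code, not a stable public API, and may differ in later releases. A condensed sketch of the insertion-point pattern:

    import torch
    import torch.fx

    graph = torch.fx.Graph()
    x = graph.placeholder('x')
    relu = graph.call_function(the_function=torch.relu, args=(x,))
    graph.output(relu)

    # Splice a torch.neg in front of the existing relu call.
    with torch.fx.graph.insert_before(relu):
        neg = graph.call_function(the_function=torch.neg, args=(x,))
    relu.args = (neg,)

    gm = torch.fx.GraphModule(torch.nn.Module(), graph)
    inp = torch.randn(3, 4)
    assert torch.equal(gm(inp), torch.relu(torch.neg(inp)))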
+sys.argv.append("--jit_executor=legacy") import os os.environ['PYTORCH_CUDA_FUSER_DISABLE_FALLBACK'] = '1' diff --git a/test/test_jit_cuda_fuser_profiling.py b/test/test_jit_cuda_fuser_profiling.py index 7559b85519c4..5114ab190457 100644 --- a/test/test_jit_cuda_fuser_profiling.py +++ b/test/test_jit_cuda_fuser_profiling.py @@ -1,5 +1,5 @@ import sys -sys.argv.append("--ge_config=profiling") +sys.argv.append("--jit_executor=profiling") import os os.environ['PYTORCH_CUDA_FUSER_DISABLE_FALLBACK'] = '1' diff --git a/test/test_jit_fuser_legacy.py b/test/test_jit_fuser_legacy.py index c33983e45e79..420075f6e611 100644 --- a/test/test_jit_fuser_legacy.py +++ b/test/test_jit_fuser_legacy.py @@ -1,5 +1,5 @@ import sys -sys.argv.append("--ge_config=legacy") +sys.argv.append("--jit_executor=legacy") from test_jit_fuser import * if __name__ == '__main__': diff --git a/test/test_jit_legacy.py b/test/test_jit_legacy.py index 2422e518a7f9..b17908e910bb 100644 --- a/test/test_jit_legacy.py +++ b/test/test_jit_legacy.py @@ -1,5 +1,5 @@ import sys -sys.argv.append("--ge_config=legacy") +sys.argv.append("--jit_executor=legacy") from test_jit import * if __name__ == '__main__': diff --git a/test/test_jit_profiling.py b/test/test_jit_profiling.py index be02985e69a8..dc6bb2fbf878 100644 --- a/test/test_jit_profiling.py +++ b/test/test_jit_profiling.py @@ -1,5 +1,5 @@ import sys -sys.argv.append("--ge_config=profiling") +sys.argv.append("--jit_executor=profiling") from test_jit import * if __name__ == '__main__': diff --git a/test/test_jit_simple.py b/test/test_jit_simple.py index 910e4a17713d..23da6602c572 100644 --- a/test/test_jit_simple.py +++ b/test/test_jit_simple.py @@ -1,5 +1,5 @@ import sys -sys.argv.append("--ge_config=simple") +sys.argv.append("--jit_executor=simple") from test_jit import * if __name__ == '__main__': diff --git a/test/test_nn.py b/test/test_nn.py index e7a185c08951..ccf4ea7aa8d1 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -10130,6 +10130,34 @@ def test_GroupNorm_empty(self, device): with torch.backends.cudnn.flags(enabled=False): self._test_module_empty_input(mod, inp) + @onlyOnCPUAndCUDA + def test_ReplicationPad_empty(self, device): + for mod, inp in [ + (torch.nn.ReplicationPad1d(3), torch.randn(0, 3, 10, device=device)), + (torch.nn.ReplicationPad2d(3), torch.randn(0, 3, 10, 10, device=device)), + (torch.nn.ReplicationPad3d(3), torch.randn(0, 3, 10, 10, 10, device=device))]: + self._test_module_empty_input(mod, inp, check_size=False) + + with self.assertRaisesRegex(NotImplementedError, 'Only 3D'): + mod = torch.nn.ReplicationPad1d(2) + inp = torch.randn(3, 10, device=device) + mod(inp) + + with self.assertRaisesRegex(RuntimeError, 'Expected 2D or 3D'): + mod = torch.nn.ReplicationPad1d(2) + inp = torch.randn(3, 0, 10, device=device) + mod(inp) + + with self.assertRaisesRegex(RuntimeError, 'Expected 3D or 4D'): + mod = torch.nn.ReplicationPad2d((2, 2, 2, 2)) + inp = torch.randn(43, 0, 10, 10, device=device) + mod(inp) + + with self.assertRaisesRegex(RuntimeError, 'Expected 4D or 5D'): + mod = torch.nn.ReplicationPad3d((2, 2, 2, 2, 2, 2)) + inp = torch.randn(3, 0, 10, 10, 10, device=device) + mod(inp) + @onlyOnCPUAndCUDA def test_ReflectionPad_empty(self, device): for mod, inp in [ diff --git a/test/test_spectral_ops.py b/test/test_spectral_ops.py index d7ef731699b3..82ed2225bda8 100644 --- a/test/test_spectral_ops.py +++ b/test/test_spectral_ops.py @@ -510,6 +510,81 @@ def test_fftn_invalid(self, device): with self.assertRaisesRegex(RuntimeError, "Expected a real 
input"): torch.fft.rfftn(c) + # Helper functions + + @skipCPUIfNoMkl + @skipCUDAIfRocm + @onlyOnCPUAndCUDA + @unittest.skipIf(not TEST_NUMPY, 'NumPy not found') + @dtypes(torch.float, torch.double) + def test_fftfreq_numpy(self, device, dtype): + test_args = [ + *product( + # n + range(1, 20), + # d + (None, 10.0), + ) + ] + + functions = ['fftfreq', 'rfftfreq'] + + for fname in functions: + torch_fn = getattr(torch.fft, fname) + numpy_fn = getattr(np.fft, fname) + + for n, d in test_args: + args = (n,) if d is None else (n, d) + expected = numpy_fn(*args) + actual = torch_fn(*args, device=device, dtype=dtype) + self.assertEqual(actual, expected, exact_dtype=False) + + @skipCPUIfNoMkl + @skipCUDAIfRocm + @onlyOnCPUAndCUDA + @unittest.skipIf(not TEST_NUMPY, 'NumPy not found') + @dtypes(torch.float, torch.double, torch.complex64, torch.complex128) + def test_fftshift_numpy(self, device, dtype): + test_args = [ + # shape, dim + *product(((11,), (12,)), (None, 0, -1)), + *product(((4, 5), (6, 6)), (None, 0, (-1,))), + *product(((1, 1, 4, 6, 7, 2),), (None, (3, 4))), + ] + + functions = ['fftshift', 'ifftshift'] + + for shape, dim in test_args: + input = torch.rand(*shape, device=device, dtype=dtype) + input_np = input.cpu().numpy() + + for fname in functions: + torch_fn = getattr(torch.fft, fname) + numpy_fn = getattr(np.fft, fname) + + expected = numpy_fn(input_np, axes=dim) + actual = torch_fn(input, dim=dim) + self.assertEqual(actual, expected) + + @skipCPUIfNoMkl + @skipCUDAIfRocm + @onlyOnCPUAndCUDA + @unittest.skipIf(not TEST_NUMPY, 'NumPy not found') + @dtypes(torch.float, torch.double) + def test_fftshift_frequencies(self, device, dtype): + for n in range(10, 15): + sorted_fft_freqs = torch.arange(-(n // 2), n - (n // 2), + device=device, dtype=dtype) + x = torch.fft.fftfreq(n, d=1 / n, device=device, dtype=dtype) + + # Test fftshift sorts the fftfreq output + shifted = torch.fft.fftshift(x) + self.assertTrue(torch.allclose(shifted, shifted.sort().values)) + self.assertEqual(sorted_fft_freqs, shifted) + + # And ifftshift is the inverse + self.assertEqual(x, torch.fft.ifftshift(shifted)) + # Legacy fft tests def _test_fft_ifft_rfft_irfft(self, device, dtype): def _test_complex(sizes, signal_ndim, prepro_fn=lambda x: x): diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py index eaaee2dab836..b0777c7fa12a 100644 --- a/test/test_tensor_creation_ops.py +++ b/test/test_tensor_creation_ops.py @@ -6,7 +6,7 @@ import torch from torch.testing._internal.common_utils import \ - (TestCase, run_tests, do_test_empty_full, TEST_NUMPY, suppress_warnings, + (TestCase, run_tests, do_test_empty_full, TEST_NUMPY, TEST_WITH_ROCM, suppress_warnings, torch_to_numpy_dtype_dict, slowTest) from torch.testing._internal.common_device_type import \ (instantiate_device_type_tests, deviceCountAtLeast, onlyOnCPUAndCUDA, @@ -1048,7 +1048,9 @@ def test_logspace_special_steps(self, device, dtype): self._test_logspace_base2(device, dtype, steps=steps) @dtypes(*torch.testing.get_all_dtypes(include_bool=False, include_half=False, include_complex=False)) - @dtypesIfCUDA(*torch.testing.get_all_dtypes(include_bool=False, include_half=True, include_complex=False)) + @dtypesIfCUDA(*((torch.testing.get_all_int_dtypes() + [torch.float32, torch.float16, torch.bfloat16]) + if TEST_WITH_ROCM + else torch.testing.get_all_dtypes(include_bool=False, include_half=True, include_complex=False))) def test_logspace(self, device, dtype): _from = random.random() to = _from + random.random() diff --git 
a/test/test_tensorexpr.py b/test/test_tensorexpr.py index 143c6dab91d2..739957569962 100644 --- a/test/test_tensorexpr.py +++ b/test/test_tensorexpr.py @@ -1046,18 +1046,18 @@ def easy(x, y): # FIXME: interp.elapsed_value() also increments due to simplifier assert llvm.elapsed_value() == 1 or interp.elapsed_value() > 1 - def test_unsqueeze(self): + def test_unsqueeze(self, N=256): def easy(x, y): a = torch.unsqueeze(x, 0) b = torch.unsqueeze(y, 0) return a + b - traced = torch.jit.trace(easy, (torch.ones(1024, 1024), torch.zeros(1024, 1024))) + traced = torch.jit.trace(easy, (torch.ones(N, N), torch.zeros(N, N))) llvm = LLVMCodeGenExecuted() interp = SimpleIREvalExecuted() - a = torch.rand(1024, 1024) + a = torch.rand(N, N) x = traced(a, a) npr = np.expand_dims(a, 0) npr = npr + npr diff --git a/test/test_torch.py b/test/test_torch.py index 0458e323b78f..7da38b211dc5 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -16684,7 +16684,9 @@ def _test_addmm_addmv(self, f, t, m, v, *, alpha=None, beta=None, transpose_out= @precisionOverride({torch.bfloat16: 1e-0, torch.half: 5e-4, torch.float: 1e-4, torch.double: 1e-8, torch.cfloat: 1e-4, torch.cdouble: 1e-8}) - @dtypesIfCUDA(*torch.testing.get_all_complex_dtypes(), *torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)) + @dtypesIfCUDA(*torch.testing.get_all_complex_dtypes(), + *([torch.float32, torch.float64, torch.bfloat16] + if TEST_WITH_ROCM else torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM))) @dtypes(torch.bfloat16, torch.float, torch.double, torch.cfloat, torch.cdouble) @unittest.skipIf(not TEST_NUMPY, "Numpy not found") def test_addmv(self, device, dtype): diff --git a/test/test_xnnpack_integration.py b/test/test_xnnpack_integration.py index a40ec48f2f37..56c44b904b47 100644 --- a/test/test_xnnpack_integration.py +++ b/test/test_xnnpack_integration.py @@ -12,10 +12,12 @@ import io import itertools +from torch.testing._internal.common_utils import TEST_WITH_TSAN @unittest.skipUnless(torch.backends.xnnpack.enabled, " XNNPACK must be enabled for these tests." " Please build with USE_XNNPACK=1.") +@unittest.skipIf(TEST_WITH_TSAN, "TSAN fails with XNNPACK. Does not seem to have a good reason for failures.") class TestXNNPACKOps(TestCase): @given(batch_size=st.integers(0, 3), data_shape=hu.array_shapes(1, 3, 2, 64), @@ -161,6 +163,7 @@ def test_conv2d_transpose(self, @unittest.skipUnless(torch.backends.xnnpack.enabled, " XNNPACK must be enabled for these tests." " Please build with USE_XNNPACK=1.") +@unittest.skipIf(TEST_WITH_TSAN, "TSAN fails with XNNPACK. Does not seem to have a good reason for failures.") class TestXNNPACKSerDes(TestCase): @given(batch_size=st.integers(0, 3), data_shape=hu.array_shapes(1, 3, 2, 64), @@ -551,6 +554,7 @@ def forward(self, x): @unittest.skipUnless(torch.backends.xnnpack.enabled, " XNNPACK must be enabled for these tests." " Please build with USE_XNNPACK=1.") +@unittest.skipIf(TEST_WITH_TSAN, "TSAN fails with XNNPACK. Does not seem to have a good reason for failures.") class TestXNNPACKRewritePass(TestCase): @staticmethod def validate_transformed_module( @@ -911,6 +915,7 @@ def forward(self, x): @unittest.skipUnless(torch.backends.xnnpack.enabled, " XNNPACK must be enabled for these tests." 
" Please build with USE_XNNPACK=1.") +@unittest.skipIf(TEST_WITH_TSAN, "TSAN is not fork-safe since we're forking in a multi-threaded environment") class TestXNNPACKConv1dTransformPass(TestCase): @staticmethod def validate_transform_conv1d_to_conv2d( diff --git a/third_party/fbgemm b/third_party/fbgemm index 1d710393d5b7..fe9164007c33 160000 --- a/third_party/fbgemm +++ b/third_party/fbgemm @@ -1 +1 @@ -Subproject commit 1d710393d5b7588f5de3b83f51c22bbddf095229 +Subproject commit fe9164007c3392a12ea51a19b0f4e9f40d24f88d diff --git a/tools/autograd/templates/python_fft_functions.cpp b/tools/autograd/templates/python_fft_functions.cpp index 7d0186538c98..1dbdca565792 100644 --- a/tools/autograd/templates/python_fft_functions.cpp +++ b/tools/autograd/templates/python_fft_functions.cpp @@ -7,14 +7,27 @@ #include "torch/csrc/autograd/python_variable.h" #include "torch/csrc/autograd/utils/wrap_outputs.h" #include "torch/csrc/autograd/utils/python_arg_parsing.h" +#include "torch/csrc/autograd/generated/variable_factories.h" #include "torch/csrc/utils/python_arg_parser.h" #include "torch/csrc/utils/structseq.h" +#include "torch/csrc/utils/cuda_lazy_init.h" + +#include using at::Tensor; +using at::Device; +using at::Layout; using at::Scalar; -using at::MemoryFormat; -using at::Generator; +using at::ScalarType; +using at::Backend; +using at::OptionalDeviceGuard; +using at::DeviceGuard; +using at::TensorOptions; using at::IntArrayRef; +using at::Generator; +using at::TensorList; +using at::Dimname; +using at::DimnameList; using namespace torch::autograd::utils; diff --git a/tools/clang_format_all.py b/tools/clang_format_all.py index 710a21e33514..77ca68d92b0b 100755 --- a/tools/clang_format_all.py +++ b/tools/clang_format_all.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ -A script that runs clang-format on all C/C++ files in CLANG_FORMAT_WHITELIST. There is +A script that runs clang-format on all C/C++ files in CLANG_FORMAT_ALLOWLIST. There is also a diff mode which simply checks if clang-format would make any changes, which is useful for CI purposes. @@ -14,22 +14,22 @@ import sys from clang_format_utils import get_and_check_clang_format, CLANG_FORMAT_PATH -# Whitelist of directories to check. All files that in that directory +# Allowlist of directories to check. All files that in that directory # (recursively) will be checked. -# If you edit this, please edit the whitelist in clang_format_ci.sh as well. -CLANG_FORMAT_WHITELIST = ["torch/csrc/jit/", "test/cpp/jit/", "test/cpp/tensorexpr/"] +# If you edit this, please edit the allowlist in clang_format_ci.sh as well. +CLANG_FORMAT_ALLOWLIST = ["torch/csrc/jit/", "test/cpp/jit/", "test/cpp/tensorexpr/"] # Only files with names matching this regex will be formatted. CPP_FILE_REGEX = re.compile(".*\\.(h|cpp|cc|c|hpp)$") -def get_whitelisted_files(): +def get_allowlisted_files(): """ - Parse CLANG_FORMAT_WHITELIST and resolve all directories. - Returns the set of whitelist cpp source files. + Parse CLANG_FORMAT_ALLOWLIST and resolve all directories. + Returns the set of allowlist cpp source files. """ matches = [] - for dir in CLANG_FORMAT_WHITELIST: + for dir in CLANG_FORMAT_ALLOWLIST: for root, dirnames, filenames in os.walk(dir): for filename in filenames: if CPP_FILE_REGEX.match(filename): @@ -77,7 +77,7 @@ async def file_clang_formatted_correctly(filename, semaphore, verbose=False): async def run_clang_format(max_processes, diff=False, verbose=False): """ - Run clang-format to all files in CLANG_FORMAT_WHITELIST that match CPP_FILE_REGEX. 
+ Run clang-format to all files in CLANG_FORMAT_ALLOWLIST that match CPP_FILE_REGEX. """ # Check to make sure the clang-format binary exists. if not os.path.exists(CLANG_FORMAT_PATH): @@ -97,7 +97,7 @@ async def run_clang_format(max_processes, diff=False, verbose=False): # Format files in parallel. if diff: - for f in asyncio.as_completed([file_clang_formatted_correctly(f, semaphore, verbose) for f in get_whitelisted_files()]): + for f in asyncio.as_completed([file_clang_formatted_correctly(f, semaphore, verbose) for f in get_allowlisted_files()]): ok &= await f if ok: @@ -105,7 +105,7 @@ async def run_clang_format(max_processes, diff=False, verbose=False): else: print("Some files not formatted correctly") else: - await asyncio.gather(*[run_clang_format_on_file(f, semaphore, verbose) for f in get_whitelisted_files()]) + await asyncio.gather(*[run_clang_format_on_file(f, semaphore, verbose) for f in get_allowlisted_files()]) return ok @@ -134,7 +134,7 @@ def main(args): options = parse_args(args) # Get clang-format and make sure it is the right binary and it is in the right place. ok = get_and_check_clang_format(options.verbose) - # Invoke clang-format on all files in the directories in the whitelist. + # Invoke clang-format on all files in the directories in the allowlist. if ok: loop = asyncio.get_event_loop() ok = loop.run_until_complete(run_clang_format(options.max_processes, options.diff, options.verbose)) diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index b0479c01f58a..9ccc5f7cb899 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -709,6 +709,8 @@ class EnumType(JitType): class TensorType(JitType): @classmethod def get(cls) -> TensorType: ... + @classmethod + def getInferred(cls) -> TensorType: ... # Defined in torch/csrc/jit/python/python_tree_views.cpp class SourceRange: diff --git a/torch/autograd/functional.py b/torch/autograd/functional.py index 2a1d0ef55fd9..58e780c87d1b 100644 --- a/torch/autograd/functional.py +++ b/torch/autograd/functional.py @@ -381,15 +381,15 @@ def jacobian(func, inputs, create_graph=False, strict=False): Defaults to ``False``. Returns: - Jacobian (Tensor or nested tuple of Tensors): if there are a single - input and output, this will be a single Tensor containing the - Jacobian for the linearized inputs and output. If one of the two is - a tuple, then the Jacobian will be a tuple of Tensors. If both of - them are tuples, then the Jacobian will be a tuple of tuple of - Tensors where ``Jacobian[i][j]`` will contain the Jacobian of the - ``i``\th output and ``j``\th input and will have as size the - concatenation of the sizes of the corresponding output and the - corresponding input. + Jacobian (Tensor or nested tuple of Tensors): if there is a single + input and output, this will be a single Tensor containing the + Jacobian for the linearized inputs and output. If one of the two is + a tuple, then the Jacobian will be a tuple of Tensors. If both of + them are tuples, then the Jacobian will be a tuple of tuple of + Tensors where ``Jacobian[i][j]`` will contain the Jacobian of the + ``i``\th output and ``j``\th input and will have as size the + concatenation of the sizes of the corresponding output and the + corresponding input. Example: @@ -476,12 +476,12 @@ def hessian(func, inputs, create_graph=False, strict=False): Defaults to ``False``. Returns: - Hessian (Tensor or a tuple of tuple of Tensors) if there are a single input, - this will be a single Tensor containing the Hessian for the input. 
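As a quick illustration of the return conventions the `jacobian` docstring above describes (a single Tensor for a single input/output pair, tuples otherwise), assuming only the documented `torch.autograd.functional.jacobian` signature:

    import torch
    from torch.autograd.functional import jacobian

    def f(x):
        return x.exp()                    # single input, single output

    J = jacobian(f, torch.randn(2))       # a single Tensor of shape (2, 2)

    def g(x, y):
        return x * y                      # tuple of inputs

    Jx, Jy = jacobian(g, (torch.randn(3), torch.randn(3)))  # tuple of Tensors, each (3, 3)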
- If it is a tuple, then the Hessian will be a tuple of tuples where - ``Hessian[i][j]`` will contain the Hessian of the ``i``\th input - and ``j``\th input with size the sum of the size of the ``i``\th input plus - the size of the ``j``\th input. + Hessian (Tensor or a tuple of tuple of Tensors): if there is a single input, + this will be a single Tensor containing the Hessian for the input. + If it is a tuple, then the Hessian will be a tuple of tuples where + ``Hessian[i][j]`` will contain the Hessian of the ``i``\th input + and ``j``\th input with size the sum of the size of the ``i``\th input plus + the size of the ``j``\th input. Example: @@ -660,7 +660,9 @@ def hvp(func, inputs, v=None, create_graph=False, strict=False): hvp for said inputs, which is the expected mathematical value. Defaults to ``False``. Returns: - func_output (tuple of Tensors or Tensor): output of ``func(inputs)`` + output (tuple): tuple with: + func_output (tuple of Tensors or Tensor): output of ``func(inputs)`` + hvp (tuple of Tensors or Tensor): result of the dot product with the same shape as the inputs. diff --git a/torch/csrc/api/include/torch/fft.h b/torch/csrc/api/include/torch/fft.h index 1c119ed75226..8a094ec9e235 100644 --- a/torch/csrc/api/include/torch/fft.h +++ b/torch/csrc/api/include/torch/fft.h @@ -166,4 +166,66 @@ inline Tensor ihfft(const Tensor& self, return torch::fft_ihfft(self, n, dim, norm); } +/// Computes the discrete Fourier Transform sample frequencies for a signal of size n. +/// +/// See https://pytorch.org/docs/master/fft.html#torch.fft.fftfreq +/// +/// Example: +/// ``` +/// auto frequencies = torch::fft::fftfreq(128, torch::kDouble); +/// ``` +inline Tensor fftfreq(int64_t n, double d, const TensorOptions& options={}) { + return torch::fft_fftfreq(n, d, options); +} + +inline Tensor fftfreq(int64_t n, const TensorOptions& options={}) { + return torch::fft_fftfreq(n, /*d=*/1.0, options); +} + +/// Computes the sample frequencies for torch.fft.rfft with a signal of size n. +/// +/// Like torch.fft.rfft, only the positive frequencies are included. +/// See https://pytorch.org/docs/master/fft.html#torch.fft.rfftfreq +/// +/// Example: +/// ``` +/// auto frequencies = torch::fft::rfftfreq(128, torch::kDouble); +/// ``` +inline Tensor rfftfreq(int64_t n, double d, const TensorOptions& options) { + return torch::fft_rfftfreq(n, d, options); +} + +inline Tensor rfftfreq(int64_t n, const TensorOptions& options) { + return torch::fft_rfftfreq(n, /*d=*/1.0, options); +} + +/// Reorders n-dimensional FFT output to have negative frequency terms first, by +/// a torch.roll operation. 
+/// +/// See https://pytorch.org/docs/master/fft.html#torch.fft.fftshift +/// +/// Example: +/// ``` +/// auto x = torch::randn({127, 4}); +/// auto centred_fft = torch::fft::fftshift(torch::fft::fftn(x)); +/// ``` +inline Tensor fftshift(const Tensor& x, c10::optional dim=c10::nullopt) { + return torch::fft_fftshift(x, dim); +} + +/// Inverse of torch.fft.fftshift +/// +/// See https://pytorch.org/docs/master/fft.html#torch.fft.ifftshift +/// +/// Example: +/// ``` +/// auto x = torch::randn({127, 4}); +/// auto shift = torch::fft::fftshift(x) +/// auto unshift = torch::fft::ifftshift(shift); +/// assert(torch::allclose(x, unshift)); +/// ``` +inline Tensor ifftshift(const Tensor& x, c10::optional dim=c10::nullopt) { + return torch::fft_ifftshift(x, dim); +} + }} // torch::fft diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp index 5ddaf4a4855d..e952b0afc772 100644 --- a/torch/csrc/autograd/engine.cpp +++ b/torch/csrc/autograd/engine.cpp @@ -513,12 +513,10 @@ void GraphTask::exec_post_processing() { } void GraphTask::set_exception_without_signal(const std::shared_ptr& fn) { - std::unique_lock lock(mutex_); - if (!has_error_.load()) { + if (!has_error_.exchange(true)) { if (AnomalyMode::is_enabled() && fn) { fn->metadata()->print_stack(fn->name()); } - has_error_ = true; } } diff --git a/torch/csrc/cuda/nccl.cpp b/torch/csrc/cuda/nccl.cpp index 6cef307c7cce..780b129ab922 100644 --- a/torch/csrc/cuda/nccl.cpp +++ b/torch/csrc/cuda/nccl.cpp @@ -21,6 +21,10 @@ ncclComm_t* to_nccl_comm(torch::cuda::nccl::ncclComm_t* var) { return reinterpret_cast(var); } +ncclComm_t to_nccl_comm(torch::cuda::nccl::ncclComm_t var) { + return reinterpret_cast(var); +} + ncclUniqueId* to_nccl_unique_id(torch::cuda::nccl::ncclUniqueId* var) { return reinterpret_cast(var); } @@ -107,16 +111,20 @@ using namespace at; namespace detail { +static inline void NCCL_CHECK(ncclResult_t result) { + NCCL_CHECK(from_nccl_result(result)); +} + struct AutoNcclGroup { AutoNcclGroup() { (c10::cuda::CUDACachingAllocator::getFreeMutex())->lock(); #if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2) - NCCL_CHECK(from_nccl_result(ncclGroupStart())); + NCCL_CHECK(ncclGroupStart()); #endif } ~AutoNcclGroup() { #if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2) - NCCL_CHECK(from_nccl_result(ncclGroupEnd())); + NCCL_CHECK(ncclGroupEnd()); #endif (c10::cuda::CUDACachingAllocator::getFreeMutex())->unlock(); } @@ -133,8 +141,8 @@ struct NcclCommList { int ndevices; NcclCommList(const std::vector& devices) : comms(new ncclComm_t[devices.size()]), ndevices(devices.size()) { - NCCL_CHECK(from_nccl_result( - ncclCommInitAll(to_nccl_comm(comms.get()), devices.size(), devices.data()))); + NCCL_CHECK( + ncclCommInitAll(to_nccl_comm(comms.get()), devices.size(), devices.data())); } NcclCommList(NcclCommList&& foo) = default; ~NcclCommList() { @@ -326,7 +334,7 @@ void get_unique_id(ncclUniqueId& id) { #ifdef USE_NCCL using namespace torch::cuda::nccl::detail; - NCCL_CHECK(from_nccl_result(ncclGetUniqueId(to_nccl_unique_id(&id)))); + NCCL_CHECK(ncclGetUniqueId(to_nccl_unique_id(&id))); #else AT_ERROR("PyTorch built without NCCL support"); #endif @@ -337,11 +345,11 @@ ncclComm_t comm_init_rank(int nranks, const ncclUniqueId& comm_id, int rank) { using namespace torch::cuda::nccl::detail; ncclComm_t comm; ncclUniqueId id = comm_id; - NCCL_CHECK(from_nccl_result(ncclCommInitRank( + NCCL_CHECK(ncclCommInitRank( to_nccl_comm(&comm), nranks, *(to_nccl_unique_id(&id)), - rank))); + rank)); return comm; #else return nullptr; @@ -362,8 +370,7 @@ 
void comm_destroy(ncclComm_t comm) #ifdef USE_NCCL using namespace torch::cuda::nccl::detail; - NCCL_CHECK(from_nccl_result(ncclCommDestroy( - *(to_nccl_comm(&comm))))); + NCCL_CHECK(ncclCommDestroy(to_nccl_comm(comm))); #endif } @@ -420,8 +427,8 @@ void broadcast( count_max, ")"); ncclComm_t comm = comms[i]; - NCCL_CHECK(from_nccl_result(ncclBcast( - tensors[i].data_ptr(), numel, data_type, 0, *(to_nccl_comm(&comm)), stream))); + NCCL_CHECK(ncclBcast( + tensors[i].data_ptr(), numel, data_type, 0, to_nccl_comm(comm), stream)); } #else AT_ERROR("PyTorch built without NCCL support"); @@ -460,15 +467,15 @@ void reduce( : streams[i]->stream(); ncclComm_t comm = comms_ref[i]; - NCCL_CHECK(from_nccl_result(ncclReduce( + NCCL_CHECK(ncclReduce( inputs[i].data_ptr(), root == i ? output.data_ptr() : nullptr, count, data_type, to_nccl_red_op(op), root, - *(to_nccl_comm(&comm)), - stream))); + to_nccl_comm(comm), + stream)); } #else AT_ERROR("PyTorch built without NCCL support"); @@ -512,14 +519,14 @@ void all_reduce( : streams[i]->stream(); ncclComm_t comm = comms_ref[i]; - NCCL_CHECK(from_nccl_result(ncclAllReduce( + NCCL_CHECK(ncclAllReduce( inputs[i].data_ptr(), outputs[i].data_ptr(), count, data_type, to_nccl_red_op(op), - *(to_nccl_comm(&comm)), - stream))); + to_nccl_comm(comm), + stream)); } #else AT_ERROR("PyTorch built without NCCL support"); @@ -554,14 +561,14 @@ void reduce_scatter( : streams[i]->stream(); ncclComm_t comm = comms_ref[i]; - NCCL_CHECK(from_nccl_result(ncclReduceScatter( + NCCL_CHECK(ncclReduceScatter( inputs[i].data_ptr(), outputs[i].data_ptr(), count, data_type, to_nccl_red_op(op), - *(to_nccl_comm(&comm)), - stream))); + to_nccl_comm(comm), + stream)); } #else AT_ERROR("PyTorch built without NCCL support"); @@ -596,21 +603,21 @@ void all_gather( ncclComm_t comm = comms_ref[i]; #if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2) - NCCL_CHECK(from_nccl_result(ncclAllGather( + NCCL_CHECK(ncclAllGather( inputs[i].data_ptr(), outputs[i].data_ptr(), count, data_type, - *(to_nccl_comm(&comm)), - stream))); + to_nccl_comm(comm), + stream)); #else - NCCL_CHECK(from_nccl_result(ncclAllGather( + NCCL_CHECK(ncclAllGather( inputs[i].data_ptr(), count, data_type, outputs[i].data_ptr(), - *(to_nccl_comm(&comm)), - stream))); + to_nccl_comm(comm), + stream)); #endif } #else diff --git a/torch/csrc/distributed/c10d/c10d_frontend.h b/torch/csrc/distributed/c10d/c10d_frontend.h index 9ff4b69999c7..816c8d9fe473 100644 --- a/torch/csrc/distributed/c10d/c10d_frontend.h +++ b/torch/csrc/distributed/c10d/c10d_frontend.h @@ -1,86 +1,226 @@ #pragma once -#include -#include #include #include +#include +#include +#include +#include +#include #include #include -#include -#include namespace c10d { class Backend { - public: - // Maps to Backend.__new__ in Python. - static std::string get(std::string); + public: + // Maps to Backend.__new__ in Python. + static std::string get(std::string); - // TODO: How to support registering third_party backend? - static void registerBackend(); + // TODO: How to support registering third_party backend? + static void registerBackend(); - private: - // TODO: Should this be an enum list instead since this set doesn't - // change at all. - std::unordered_set registered_backends_; + private: + // TODO: Should this be an enum list instead since this set doesn't + // change at all. 
+ std::unordered_set registered_backends_; }; -class DistributedC10d{ - public: - void initProcessGroup( - const std::string& backend, - const std::string& init_method, - const std::chrono::milliseconds& timeout, - int64_t world_size, - int64_t rank, - std::shared_ptr store, - const std::string& group_name); - - void destroyProcessGroup(std::shared_ptr group); - int64_t getRank(std::shared_ptr group); - int64_t getWorldSize(std::shared_ptr group); - - ProcessGroup::Work isend(at::Tensor tensor, int64_t dst, std::shared_ptr group, c10::optional tag); - ProcessGroup::Work irecv(at::Tensor tensor, int64_t src, std::shared_ptr group, c10::optional tag); - - private: - DistributedC10d(){}; - - bool rankNotInGroup(std::shared_ptr group) const; - int64_t getGroupRank( - std::shared_ptr group, - const int64_t rank) const; - int64_t getGlobalRank( - std::shared_ptr group, - const int64_t global_rank) const; - void checkDefaultPg() const; - int64_t getGroupSize(std::shared_ptr group) const; - int64_t getBackend(std::shared_ptr group); - - std::string backend_; - // TODO: Ask Alex what kind of equality we need. It determine whether we - // need to use ProcessGroup or ProcesGroup* as key. - std::unordered_map< - std::shared_ptr, - std::pair, std::shared_ptr>> - pg_map_; - - // Note, this is different mapping relationship than original Python - // implementation. - std::unordered_map, std::string> pg_names_; - - // Value is global_rank:group_rank mapping. - std::unordered_map, std::vector> - pg_group_ranks_; - - std::shared_ptr default_pg_; - - // Default value should be "env://" - std::string default_pg_init_method_; - - int64_t group_count_; +class DistributedC10d { + public: + void initProcessGroup( + const std::string& backend, + const std::string& init_method, + const std::chrono::milliseconds& timeout, + int64_t world_size, + int64_t rank, + std::shared_ptr store, + const std::string& group_name); + + void destroyProcessGroup(std::shared_ptr group); + int64_t getRank(std::shared_ptr group); + int64_t getWorldSize(std::shared_ptr group); + + ProcessGroup::Work isend( + at::Tensor tensor, + int64_t dst, + std::shared_ptr group, + c10::optional tag); + + ProcessGroup::Work irecv( + at::Tensor tensor, + int64_t src, + std::shared_ptr group, + c10::optional tag); + + ProcessGroup::Work send( + at::Tensor tensor, + int64_t dst, + std::shared_ptr group, + c10::optional tag); + + ProcessGroup::Work recv( + at::Tensor tensor, + int64_t src, + std::shared_ptr group, + c10::optional tag); + + c10::optional broadcastMultiGPU( + std::vector tensor_list, + int64_t src, + std::shared_ptr group, + bool async_op, + int64_t src_tensor); + + c10::optional broadcast( + at::Tensor tensor, + int64_t src, + std::shared_ptr group, + bool async_op); + + c10::optional allReduceMultiGPU( + std::vector& tensor_list, + ReduceOp op, + std::shared_ptr group, + bool async_op); + + c10::optional allReduce( + at::Tensor tensor, + ReduceOp op, + std::shred_ptr group, + bool async_op); + + c10::optional allReduceCoalesced( + at::Tensor tensor, + ReduceOp op, + std::shred_ptr group, + bool async_op); + + c10::optional reduceMultiGPU( + std::vector& tensor_list, + int64_t dst, + ReduceOp op, + std::shared_ptr group, + bool async_op, + int64_t dst_tensor); + + c10::optional reduce( + at::Tensor tensor, + int64_t dst, + ReduceOp op, + std::shared_ptr& group, + bool async_op); + + c10::optional allGatherMultiGPU( + std::vector>& output_tensor_lists, + const std::vector& input_tensor_list, + std::shared_ptr group, + bool async_op); + + 
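The `DistributedC10d` methods declared in this header mirror the collectives already exposed from Python via `torch.distributed`. For orientation, a minimal Python-side equivalent of one of them; this assumes a Gloo build, one launched process per rank, and the usual `env://` rendezvous variables:

    import torch
    import torch.distributed as dist

    # Requires RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT in the environment.
    dist.init_process_group(backend="gloo", init_method="env://")
    t = torch.ones(4) * dist.get_rank()
    dist.all_reduce(t, op=dist.ReduceOp.SUM)   # every rank now holds the sum over ranks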
// TODO TODO following APIs take python objects and unpickle them, how do we support these? + // ProcessGroup::Work allGatherObject() + // ProcessGroup::Work gatherObject() + // ProcessGroup::Work broadcastObjectList() + + c10::optional allGather( + std::vector& tensor_list, + at::Tensor tensor, + std::shared_ptr group, + bool async_op); + + c10::optional allGatherCoalesced( + std::vector>& output_tensor_lists, + std::vector& input_tensor_list, + std::shared_ptr group, + bool async_op); + + c10::optional gather( + at::Tensor tensor, + std::vector& gather_list, + int64_t dst, + std::shared_ptr group, + bool async_op); + + c10::optional scatter( + at::Tensor tensor, + std::vector& scatter_list, + int64_t dst, + std::shared_ptr group, + bool async_op); + + ProcessGroup::Work reduceScatterMultiGPU( + std::vector& output_tensor_list, + const std::vector>& input_tensor_lists, + ReduceOp op, + std::shared_ptr group, + bool async_op); + + ProcessGroup::Work reduceScatter( + at::Tensor output, + const std::vector& input_list, + ReduceOp op, + std::shared_ptr group, + bool async_op); + + ProcessGroup::Work allToAllSingle( + at::Tensor output, + at::Tensor input, + const std::vector& output_split_sizes, + const std::vector& input_split_sizes, + std::shared_ptr group, + bool async_op); + + ProcessGroup::Work allToAll( + std::vector& output_tensor_list, + const std::vector& input_tensor_list, + std::shared_ptr group, + bool async_op); + + ProcessGroup::Work barrier( + std::shared_ptr group, + bool async_op); + + std::shared_ptr newGroup( + std::vector ranks, + std::chrono::milliseconds timeout, + Backend backend); + + private: + DistributedC10d(){}; + + bool rankNotInGroup(std::shared_ptr group) const; + int64_t getGroupRank(std::shared_ptr group, const int64_t rank) + const; + int64_t getGlobalRank( + std::shared_ptr group, + const int64_t global_rank) const; + void checkDefaultPg() const; + int64_t getGroupSize(std::shared_ptr group) const; + int64_t getBackend(std::shared_ptr group); + + std::string backend_; + // TODO: Ask Alex what kind of equality we need. It determine whether we + // need to use ProcessGroup or ProcesGroup* as key. + std::unordered_map< + std::shared_ptr, + std::pair, std::shared_ptr>> + pg_map_; + + // Note, this is different mapping relationship than original Python + // implementation. + std::unordered_map, std::string> pg_names_; + + // Value is global_rank:group_rank mapping. + std::unordered_map, std::vector> + pg_group_ranks_; + + std::shared_ptr default_pg_; + + // Default value should be "env://" + std::string default_pg_init_method_; + + int64_t group_count_; }; - } // namespace c10d diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 38a1811692c2..d15ea9d23412 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -949,6 +949,12 @@ that adds a prefix to each key inserted to the store. 
.def(py::init<>()) .def_readwrite("is_high_priority", &::c10d::ProcessGroupNCCL::Options::isHighPriorityStream) .def_readwrite("op_timeout", &::c10d::ProcessGroupNCCL::Options::opTimeout); + processGroupNCCL.def_static("_group_start", []() { + ::c10d::ProcessGroupNCCL::groupStart(); + }); + processGroupNCCL.def_static("_group_end", []() { + ::c10d::ProcessGroupNCCL::groupEnd(); + }); #endif #ifdef USE_C10D_MPI diff --git a/torch/csrc/jit/codegen/cuda/executor.cpp b/torch/csrc/jit/codegen/cuda/executor.cpp index f33079bcbab5..a0df3c784778 100644 --- a/torch/csrc/jit/codegen/cuda/executor.cpp +++ b/torch/csrc/jit/codegen/cuda/executor.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.cpp b/torch/csrc/jit/codegen/cuda/executor_utils.cpp index 9670968b8fe1..af4e127cc548 100644 --- a/torch/csrc/jit/codegen/cuda/executor_utils.cpp +++ b/torch/csrc/jit/codegen/cuda/executor_utils.cpp @@ -1,5 +1,6 @@ #include #include +#include #include diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.h b/torch/csrc/jit/codegen/cuda/executor_utils.h index 76b8a9a145f1..b306cf04da0a 100644 --- a/torch/csrc/jit/codegen/cuda/executor_utils.h +++ b/torch/csrc/jit/codegen/cuda/executor_utils.h @@ -1,11 +1,12 @@ #pragma once #include -#include #include #include +#include + #include #include diff --git a/torch/csrc/jit/python/python_ir.cpp b/torch/csrc/jit/python/python_ir.cpp index c5889144bd1f..78d11e79eb03 100644 --- a/torch/csrc/jit/python/python_ir.cpp +++ b/torch/csrc/jit/python/python_ir.cpp @@ -747,7 +747,8 @@ void initPythonIRBindings(PyObject* module_) { py::class_>(m, "FloatType") .def_static("get", &FloatType::get); py::class_>(m, "TensorType") - .def_static("get", &TensorType::get); + .def_static("get", &TensorType::get) + .def_static("getInferred", &TensorType::getInferred); py::class_>(m, "BoolType") .def_static("get", &BoolType::get); py::class_>(m, "StringType") diff --git a/torch/distributed/__init__.py b/torch/distributed/__init__.py index ba5ec8bdb5fc..44b7876c4787 100644 --- a/torch/distributed/__init__.py +++ b/torch/distributed/__init__.py @@ -25,3 +25,7 @@ def is_available(): # this. from .distributed_c10d import _backend + + # TODO: remove this once CI issue is resolved + # https://github.com/pytorch/pytorch/issues/42517 + from .distributed_c10d import _P2POp, _batch_isend_irecv diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index ae4338cd28fc..a125d8a1204b 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -1,6 +1,7 @@ import pickle import torch import warnings +import contextlib from torch._six import string_classes from datetime import timedelta @@ -159,8 +160,7 @@ class GroupMember(object): def _rank_not_in_group(group): """ - Helper that checks if the current process's rank is not in a given group - + Helper that checks if the current process's rank is not in a given group. """ if group == GroupMember.WORLD: return False @@ -170,8 +170,7 @@ def _rank_not_in_group(group): def _get_group_rank(group, rank): """ Helper that gets a given group's local rank in the group from a given global - rank - + rank. """ if group is GroupMember.WORLD: raise RuntimeError("group.WORLD does not have local rank to global " @@ -188,8 +187,7 @@ def _get_group_rank(group, rank): def _get_global_rank(group, group_rank): """ Helper that gets a given group's global rank from a given local rank in the - group - + group. 
""" if group is GroupMember.WORLD: raise RuntimeError("group.WORLD does not have local rank to global " @@ -204,8 +202,7 @@ def _get_global_rank(group, group_rank): def _check_default_pg(): """ Helper that checks if the default ProcessGroup has been initialized, with - assertion - + assertion. """ assert _default_pg is not None, \ "Default process group is not initialized" @@ -213,8 +210,7 @@ def _check_default_pg(): def _get_group_size(group): """ - Helper that gets a given group's world size - + Helper that gets a given group's world size. """ if group is GroupMember.WORLD: _check_default_pg() @@ -227,7 +223,6 @@ def _get_group_size(group): def _check_single_tensor(param, param_name): """ Helper to check that the parameter ``param_name`` is a single tensor. - """ if not isinstance(param, torch.Tensor): raise RuntimeError("Invalid function argument. Expected parameter `{}` " @@ -237,7 +232,6 @@ def _check_single_tensor(param, param_name): def _check_tensor_list(param, param_name): """ Helper to check that the parameter ``param_name`` is a list of tensors. - """ if not isinstance(param, list) or \ not all(isinstance(p, torch.Tensor) for p in param): @@ -245,10 +239,34 @@ def _check_tensor_list(param, param_name): "to be of type List[torch.Tensor].".format(param_name)) +def _check_op(op): + """ + Helper to check that the ``op`` is either isend or irecv. + """ + if op not in [isend, irecv]: + raise RuntimeError("Invalid ``op``. Expected ``op`` " + "to be of type ``torch.distributed.isend`` or " + "``torch.distributed.irecv``.") + +def _check_p2p_op_list(p2p_op_list): + """ + Helper to check that the ``p2p_op_list`` is a list of _P2POp instances and + all ops use the same backend. + """ + if not isinstance(p2p_op_list, list) or \ + not all(isinstance(p2p_op, _P2POp) for p2p_op in p2p_op_list): + raise RuntimeError("Invalid ``p2p_op_list``. Each op is expected to " + "to be of type ``torch.distributed._P2POp``.") + + + backend = get_backend(p2p_op_list[0].group) + if not all(backend == get_backend(p2p_op.group) for p2p_op in p2p_op_list): + raise RuntimeError("All groups need to use the same backend.") + + def is_mpi_available(): """ Checks if the MPI backend is available. - """ return _MPI_AVAILABLE @@ -256,7 +274,6 @@ def is_mpi_available(): def is_nccl_available(): """ Checks if the NCCL backend is available. - """ return _NCCL_AVAILABLE @@ -264,7 +281,6 @@ def is_nccl_available(): def is_gloo_available(): """ Checks if the Gloo backend is available. - """ return _GLOO_AVAILABLE @@ -272,7 +288,6 @@ def is_gloo_available(): def is_initialized(): """ Checking if the default process group has been initialized - """ return _default_pg is not None @@ -280,7 +295,6 @@ def is_initialized(): def _get_default_group(): """ Getting the default process group created by init_process_group - """ if not is_initialized(): raise RuntimeError("Default process group has not been initialized, " @@ -291,7 +305,6 @@ def _get_default_group(): def _get_default_store(): """ Getting the default store created by init_process_group - """ if not is_initialized(): raise RuntimeError("Default process group has not been initialized, " @@ -757,6 +770,94 @@ def recv(tensor, return src +class _P2POp(object): + """ + A class to build point-to-point operations for ``_batch_isend_irecv``. + + This class builds the type of P2P operation, communication buffer, peer rank, + Process Group group, and tag. Instances of this class will be passed to + ``_batch_isend_irecv`` for point-to-point communications. 
+ + Arguments: + op (callable): A function to send data to or receive data from a peer process. + The type of ``op`` is either ``torch.distributed.isend`` or + ``torch.distributed.irecv``. + tensor (Tensor): Tensor to send or receive. + peer (int): Destination or source rank. + group (ProcessGroup, optional): The process group to work on. + tag (int, optional): Tag to match send with recv. + """ + def __init__(self, op, tensor, peer, group=group.WORLD, tag=0): + self.op = op + self.tensor = tensor + self.peer = peer + self.group = group + self.tag = tag + + def __new__(cls, op, tensor, peer, group=group.WORLD, tag=0): + _check_op(op) + _check_single_tensor(tensor, "tensor") + return object.__new__(cls) + + +@contextlib.contextmanager +def _batch_p2p_manager(backend): + if backend == Backend.NCCL: + ProcessGroupNCCL._group_start() + try: + yield + finally: + if backend == Backend.NCCL: + ProcessGroupNCCL._group_end() + + +def _batch_isend_irecv(p2p_op_list): + """ + Send or Receive a batch of tensors asynchronously and return a list of requests. + + Process each of the operations in p2p_op_list and return the corresponding + requests. NCCL and Gloo backend are currently supported. + + Arguments: + p2p_op_list: A list of point-to-point operations(type of each operator is + ``torch.distributed._P2POp``). The order of the isend/irecv in the list + matters and it needs to match with corresponding isend/irecv on the + remote end. + + Returns: + A list of distributed request objects returned by calling the corresponding + op in the op_list. + + Examples: + >>> send_tensor = torch.arange(2) + 2 * rank + >>> recv_tensor = torch.randn(2) + >>> send_op = dist._P2POp(dist.isend, send_tensor, (rank + 1)%world_size) + >>> recv_op = dist._P2POp(dist.irecv, recv_tensor, (rank + 1)%world_size) + >>> reqs = _batch_isend_irecv([send_op, recv_op]) + >>> for req in reqs: + >>> req.wait() + >>> recv_tensor + tensor([2, 3]) # Rank 0 + tensor([0, 1]) # Rank 1 + """ + _check_p2p_op_list(p2p_op_list) + backend = get_backend(p2p_op_list[0].group) + reqs = [] + with _batch_p2p_manager(backend): + for p2p_op in p2p_op_list: + op = p2p_op.op + tensor = p2p_op.tensor + peer = p2p_op.peer + curr_group = p2p_op.group + tag = p2p_op.tag + + ret = op(tensor, peer, curr_group, tag) + + if ret is not None: + reqs.append(ret) + return reqs + + def broadcast_multigpu(tensor_list, src, group=group.WORLD, diff --git a/torch/distributions/categorical.py b/torch/distributions/categorical.py index 01f3dd520174..319d2dd01b66 100644 --- a/torch/distributions/categorical.py +++ b/torch/distributions/categorical.py @@ -16,14 +16,14 @@ class Categorical(Distribution): Samples are integers from :math:`\{0, \ldots, K-1\}` where `K` is ``probs.size(-1)``. - If :attr:`probs` is 1D with length-`K`, each element is the relative + If :attr:`probs` is 1-dimensional with length-`K`, each element is the relative probability of sampling the class at that index. - If :attr:`probs` is 2D, it is treated as a batch of relative probability - vectors. + If :attr:`probs` is N-dimensional, the first N-1 dimensions are treated as a batch of + relative probability vectors. .. note:: :attr:`probs` must be non-negative, finite and have a non-zero sum, - and it will be normalized to sum to 1. + and it will be normalized to sum to 1 along the last dimension. 
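A short sketch of the batched-`probs` behaviour the `Categorical` note above describes: each row along the last dimension is renormalized independently.

    import torch
    from torch.distributions import Categorical

    probs = torch.tensor([[1.0, 1.0, 2.0],
                          [4.0, 0.0, 0.0]])   # a batch of two 3-way distributions
    d = Categorical(probs)
    print(d.probs)      # each row renormalized to sum to 1 along the last dimension
    print(d.sample())   # one draw per batch row, shape (2,)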
See also: :func:`torch.multinomial` diff --git a/torch/fft/__init__.py b/torch/fft/__init__.py index 3e4bcc35464b..b3ffdb24b4a0 100644 --- a/torch/fft/__init__.py +++ b/torch/fft/__init__.py @@ -2,6 +2,12 @@ import torch from torch._C import _add_docstr, _fft # type: ignore +from torch._torch_docs import factory_common_args + +__all__ = ['fft', 'ifft', 'fftn', 'ifftn', + 'rfft', 'irfft', 'rfftn', 'irfftn', 'hfft', 'ihfft', + 'fftfreq', 'rfftfreq', 'fftshift', 'ifftshift', + 'Tensor'] Tensor = torch.Tensor @@ -549,3 +555,180 @@ tensor([ 2.0000+-0.0000j, -0.5000-0.6882j, -0.5000-0.1625j, -0.5000+0.1625j, -0.5000+0.6882j]) """) + +fftfreq = _add_docstr(_fft.fft_fftfreq, r""" +fftfreq(n, d=1.0, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + +Computes the discrete Fourier Transform sample frequencies for a signal of size :attr:`n`. + +Note: + By convention, :func:`~torch.fft.fft` returns positive frequency terms + first, followed by the negative frequencies in reverse order, so that + ``f[-i]`` for all :math:`0 < i \leq n/2`` in Python gives the negative + frequency terms. For an FFT of length :attr:`n` and with inputs spaced in + length unit :attr:`d`, the frequencies are:: + + f = [0, 1, ..., (n - 1) // 2, -(n // 2), ..., -1] / (d * n) + +Note: + For even lengths, the Nyquist frequency at ``f[n/2]`` can be thought of as + either negative or positive. :func:`~torch.fft.fftfreq` follows NumPy's + convention of taking it to be negative. + +Args: + n (int): the FFT length + d (float, optional): The sampling length scale. + The spacing between individual samples of the FFT input. + The default assumes unit spacing, dividing that result by the actual + spacing gives the result in physical frequency units. + +Keyword Args: + {dtype} + {layout} + {device} + {requires_grad} + +Example: + + >>> import torch.fft + >>> torch.fft.fftfreq(5) + tensor([ 0.0000, 0.2000, 0.4000, -0.4000, -0.2000]) + + For even input, we can see the Nyquist frequency at ``f[2]`` is given as + negative: + + >>> torch.fft.fftfreq(4) + tensor([ 0.0000, 0.2500, -0.5000, -0.2500]) + +""".format(**factory_common_args)) + +rfftfreq = _add_docstr(_fft.fft_rfftfreq, r""" +rfftfreq(n, d=1.0, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + +Computes the sample frequencies for :func:`~torch.fft.rfft` with a signal of size :attr:`n`. + +Note: + :func:`~torch.fft.rfft` returns Hermitian one-sided output, so only the + positive frequency terms are returned. For a real FFT of length :attr:`n` + and with inputs spaced in length unit :attr:`d`, the frequencies are:: + + f = torch.arange((n + 1) // 2) / (d * n) + +Note: + For even lengths, the Nyquist frequency at ``f[n/2]`` can be thought of as + either negative or positive. Unlike :func:`~torch.fft.fftfreq`, + :func:`~torch.fft.rfftfreq` always returns it as positive. + +Args: + n (int): the real FFT length + d (float, optional): The sampling length scale. + The spacing between individual samples of the FFT input. + The default assumes unit spacing, dividing that result by the actual + spacing gives the result in physical frequency units. 
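For instance, under the `d` convention documented above, 8 samples spaced 0.5 seconds apart give frequencies in Hz; `rfftfreq` keeps only the non-negative half, with the Nyquist term positive (reading seconds/Hz into `d` is just an illustration of "physical frequency units"):

    import torch

    torch.fft.fftfreq(8, d=0.5)
    # tensor([ 0.0000,  0.2500,  0.5000,  0.7500, -1.0000, -0.7500, -0.5000, -0.2500])

    torch.fft.rfftfreq(8, d=0.5)
    # tensor([0.0000, 0.2500, 0.5000, 0.7500, 1.0000])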
+ +Keyword Args: + {dtype} + {layout} + {device} + {requires_grad} + +Example: + + >>> import torch.fft + >>> torch.fft.rfftfreq(5) + tensor([ 0.0000, 0.2000, 0.4000]) + + >>> torch.fft.rfftfreq(4) + tensor([ 0.0000, 0.2500, 0.5000]) + + Compared to the output from :func:`~torch.fft.fftfreq`, we see that the + Nyquist frequency at ``f[2]`` has changed sign: + >>> torch.fft.fftfreq(4) + tensor([ 0.0000, 0.2500, -0.5000, -0.2500]) + +""".format(**factory_common_args)) + +fftshift = _add_docstr(_fft.fft_fftshift, r""" +fftshift(input, dim=None) -> Tensor + +Reorders n-dimensional FFT data, as provided by :func:`~torch.fft.fftn`, to have +negative frequency terms first. + +Note: + By convention, the FFT returns positive frequency terms first, followed by + the negative frequencies in reverse order, so that ``f[-i]`` for all + :math:`0 < i \leq n/2` in Python gives the negative frequency terms. + :func:`~torch.fft.fftshift` rearranges all frequencies into ascending order + from negative to positive with the zero-frequency term in the center. + +Note: + For even lengths, the Nyquist frequency at ``f[n/2]`` can be thought of as + either negative or positive. :func:`~torch.fft.fftshift` always puts the + Nyquist term at the 0-index. This is the same convention used by + :func:`~torch.fft.fftfreq`. + +Args: + input (Tensor): the tensor in FFT order + dim (int, Tuple[int], optional): The dimensions to rearrange. + Only dimensions specified here will be rearranged, any other dimensions + will be left in their original order. + Default: All dimensions of :attr:`input`. + +Example: + + >>> import torch.fft + >>> f = torch.fft.fftfreq(4) + >>> f + tensor([ 0.0000, 0.2500, -0.5000, -0.2500]) + + >>> torch.fftshift(f) + tensor([-0.5000, -0.2500, 0.0000, 0.2500]) + + Also notice that the Nyquist frequency term at ``f[2]`` was moved to the + beginning of the tensor. + + This also works for multi-dimensional transforms: + >>> x = torch.fft.fftfreq(5, d=1/5) + 0.1 * torch.fft.fftfreq(5, d=1/5).unsqueeze(1) + >>> x + tensor([[ 0.0000, 1.0000, 2.0000, -2.0000, -1.0000], + [ 0.1000, 1.1000, 2.1000, -1.9000, -0.9000], + [ 0.2000, 1.2000, 2.2000, -1.8000, -0.8000], + [-0.2000, 0.8000, 1.8000, -2.2000, -1.2000], + [-0.1000, 0.9000, 1.9000, -2.1000, -1.1000]]) + + >>> torch.fft.fftshift(x) + tensor([[-2.2000, -1.2000, -0.2000, 0.8000, 1.8000], + [-2.1000, -1.1000, -0.1000, 0.9000, 1.9000], + [-2.0000, -1.0000, 0.0000, 1.0000, 2.0000], + [-1.9000, -0.9000, 0.1000, 1.1000, 2.1000], + [-1.8000, -0.8000, 0.2000, 1.2000, 2.2000]]) + +""") + +ifftshift = _add_docstr(_fft.fft_ifftshift, r""" +ifftshift(input, dim=None) -> Tensor + +Inverse of :func:`~torch.fft.fftshift`. + +Args: + input (Tensor): the tensor in FFT order + dim (int, Tuple[int], optional): The dimensions to rearrange. + Only dimensions specified here will be rearranged, any other dimensions + will be left in their original order. + Default: All dimensions of :attr:`input`. 
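As a quick check of the inverse relationship just described, shifting and then unshifting with the fully-qualified `torch.fft.fftshift` / `torch.fft.ifftshift` round-trips exactly:

    import torch

    f = torch.fft.fftfreq(4)                 # tensor([ 0.0000,  0.2500, -0.5000, -0.2500])
    shifted = torch.fft.fftshift(f)          # tensor([-0.5000, -0.2500,  0.0000,  0.2500])
    assert torch.equal(torch.fft.ifftshift(shifted), f)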
+ifftshift = _add_docstr(_fft.fft_ifftshift, r"""
+ifftshift(input, dim=None) -> Tensor
+
+Inverse of :func:`~torch.fft.fftshift`.
+
+Args:
+    input (Tensor): the tensor in FFT order
+    dim (int, Tuple[int], optional): The dimensions to rearrange.
+        Only dimensions specified here will be rearranged; any other dimensions
+        will be left in their original order.
+        Default: All dimensions of :attr:`input`.
+
+Example:
+
+    >>> import torch.fft
+    >>> f = torch.fft.fftfreq(5)
+    >>> f
+    tensor([ 0.0000,  0.2000,  0.4000, -0.4000, -0.2000])
+
+    A round-trip through :func:`~torch.fft.fftshift` and
+    :func:`~torch.fft.ifftshift` gives the same result:
+
+    >>> shifted = torch.fft.fftshift(f)
+    >>> torch.fft.ifftshift(shifted)
+    tensor([ 0.0000,  0.2000,  0.4000, -0.4000, -0.2000])
+
+""")
diff --git a/torch/functional.py b/torch/functional.py
index 8eecf3643035..1a72aaf18a30 100644
--- a/torch/functional.py
+++ b/torch/functional.py
@@ -884,7 +884,7 @@ def tensordot(a, b, dims=2):
     Args:
         a (Tensor): Left tensor to contract
         b (Tensor): Right tensor to contract
-        dims (int or tuple of two lists of integers): number of dimensions to
+        dims (int or Tuple[List[int], List[int]] containing two lists): number of dimensions to
            contract or explicit lists of dimensions for :attr:`a` and
            :attr:`b` respectively
@@ -919,6 +919,12 @@ def tensordot(a, b, dims=2):
                 [ 3.3161,  0.0704,  5.0187, -0.4079, -4.3126,  4.8744],
                 [ 0.8223,  3.9445,  3.2168, -0.2400,  3.4117,  1.7780]])

+        >>> a = torch.randn(3, 5, 4, 6)
+        >>> b = torch.randn(6, 4, 5, 3)
+        >>> torch.tensordot(a, b, dims=([2, 1, 3], [1, 2, 0]))
+        tensor([[  7.7193,  -2.4867, -10.3204],
+                [  1.5513, -14.4737,  -6.5113],
+                [ -0.2850,   4.2573,  -3.5997]])
     """
     if not torch.jit.is_scripting():
         if (type(a) is not Tensor or type(b) is not Tensor) and has_torch_function((a, b)):
diff --git a/torch/fx/__init__.py b/torch/fx/__init__.py
index 30e65c191a30..792a905432a5 100644
--- a/torch/fx/__init__.py
+++ b/torch/fx/__init__.py
@@ -85,6 +85,6 @@ def forward(self, x):
 from .graph_module import GraphModule
 from .symbolic_trace import symbolic_trace, Tracer
-from .graph import Graph, map_arg
-from .node import Node
+from .graph import Graph
+from .node import Node, map_arg
 from .proxy import Proxy
diff --git a/torch/fx/experimental/GraphManipulation.py b/torch/fx/experimental/GraphManipulation.py
index 10a0c86e2249..0c5d18aa4fb2 100644
--- a/torch/fx/experimental/GraphManipulation.py
+++ b/torch/fx/experimental/GraphManipulation.py
@@ -1,8 +1,8 @@
 from typing import Dict, List
 from torch.fx.graph_module import GraphModule
 from typing import Any
-from torch.fx.node import Node, Target
-from torch.fx.graph import Graph, map_arg
+from torch.fx.node import Node, Target, map_arg
+from torch.fx.graph import Graph

 """find_use is used to find out if the node is another node's arg or kwargs."""
diff --git a/torch/fx/experimental/shape_prop.py b/torch/fx/experimental/shape_prop.py
index 01374727a447..52264796c7d4 100644
--- a/torch/fx/experimental/shape_prop.py
+++ b/torch/fx/experimental/shape_prop.py
@@ -15,7 +15,7 @@ def propagate(self, *args):
         env : Dict[str, Node] = {}

         def load_arg(a):
-            return torch.fx.graph.map_arg(a, lambda n: env[n.name])
+            return torch.fx.node.map_arg(a, lambda n: env[n.name])

         def fetch_attr(target : str):
             target_atoms = target.split('.')
diff --git a/torch/fx/graph.py b/torch/fx/graph.py
index f53f96db0174..ed7618372b57 100644
--- a/torch/fx/graph.py
+++ b/torch/fx/graph.py
@@ -1,4 +1,4 @@
-from .node import Node, Argument, Target
+from .node import Node, Argument, Target, map_arg
 from typing import Callable, Any, List, Dict, Optional, Tuple, Set
 import builtins
@@ -52,23 +52,22 @@ def _format_target(base: str, target: str) -> str:
         r = f'{r}.{e}'
     return r

-def map_arg(a: Argument, fn: Callable[[Node], Argument]) -> Argument:
-    """ apply fn to each Node appearing arg. arg may be a list, tuple, slice, or dict with string keys.
""" - if isinstance(a, (tuple, list)): - return type(a)(map_arg(elem, fn) for elem in a) - elif isinstance(a, dict): - return {k: map_arg(v, fn) for k, v in a.items()} - elif isinstance(a, slice): - return slice(map_arg(a.start, fn), map_arg(a.stop, fn), map_arg(a.step, fn)) - elif isinstance(a, Node): - return fn(a) - else: - return a +class insert_before: + def __init__(self, n : Node): + self.n = n + + def __enter__(self): + self.orig_insert_point = self.n.graph._insert_point + self.n.graph._insert_point = self.n + + def __exit__(self, type, value, tb): + self.n.graph._insert_point = self.orig_insert_point class Graph: def __init__(self): self._nodes : List[Node] = [] self._used_names : Dict[str, int] = {} # base name -> number + self._insert_point : Optional[Node] = None @property def nodes(self): @@ -105,9 +104,38 @@ def create_node(self, op: str, target: Target, self._mark_uses(kwargs) sanitized_name = self._register_name_used(name) if name is not None else self._name(target) n = Node(self, sanitized_name, op, target, args, kwargs) - self._nodes.append(n) + if self._insert_point is not None: + before_idx = self._nodes.index(self._insert_point) + self._nodes.insert(before_idx, n) + else: + self._nodes.append(n) return n + def move_node_before(self, to_move : Node, before : Node): + """ + Move node `to_move` before `before` in the Graph. Both `Node` arguments + must be present in this graph. + """ + # TODO: Computationally inefficient + if to_move.graph != self or before.graph != self: + raise RuntimeError('Node arguments must belong to this Graph!') + node_idx = self._nodes.index(to_move) + before_idx = self._nodes.index(before) + self._nodes.insert(before_idx, self._nodes.pop(node_idx)) + + + def erase_node(self, to_erase : Node): + """ + Erases the node `to_erase` from the `Graph`. Throws an exception if + there are still uses of that node in the `Graph`. + """ + if to_erase.uses > 0: + raise RuntimeError(f'Tried to erase Node {to_erase} but it still had {to_erase.uses} uses in the graph!') + + node_indices = [i for i, n in enumerate(self._nodes) if n == to_erase] + for idx in reversed(node_indices): + self._nodes.pop(idx) + # sugar for above when you know the op def placeholder(self, name: str) -> Node: return self.create_node('placeholder', name) @@ -202,6 +230,7 @@ def _register_name_used(self, op : str) -> str: def python_code(self, root_module: str) -> str: free_vars: List[str] = [] + modules_used : Set[str] = set() body: List[str] = [] for node in self._nodes: if node.op == 'placeholder': @@ -225,6 +254,9 @@ def python_code(self, root_module: str) -> str: body.append(f'{node.name} = {magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}\n') continue qualified_name = _qualified_name(node.target) + if '.' 
in qualified_name: + module_name = qualified_name.split('.', maxsplit=1)[0] + modules_used.add(module_name) if qualified_name == 'getattr' and \ isinstance(node.args, tuple) and \ isinstance(node.args[1], str) and \ @@ -247,9 +279,12 @@ def python_code(self, root_module: str) -> str: continue raise NotImplementedError(f'node: {node.op} {node.target}') + import_block = '\n'.join(f'import {name}' for name in sorted(modules_used)) + code = ''.join(body) code = '\n'.join(' ' + line for line in code.split('\n')) + '\n' fn_code = f"""\ +{import_block} def forward(self, {', '.join(free_vars)}): {code} """ diff --git a/torch/fx/graph_module.py b/torch/fx/graph_module.py index bbc5c26a8182..6f72a29be184 100644 --- a/torch/fx/graph_module.py +++ b/torch/fx/graph_module.py @@ -28,9 +28,7 @@ def patched_getline(*args, **kwargs): linecache.getlines = patched_getline def _forward_from_src(src : str): - gbls: Dict[str, Any] = { - 'torch': torch - } + gbls: Dict[str, Any] = {} exec_with_source(src, gbls) return gbls['forward'] @@ -107,7 +105,9 @@ class GraphModule(torch.nn.Module): forward : The Python method generated from `graph` Note that when `graph` is reassigned, `code` and `forward` will be automatically - regenerated. + regenerated. However, if you edit the contents of the `graph` without reassigning + the `graph` attribute itself, you must call `recompile()` to update the generated + code. """ def __new__(cls: 'Type[GraphModule]', *args, **kwargs): # each instance of a graph module needs its own forward method @@ -174,6 +174,14 @@ def graph(self): @graph.setter def graph(self, val) -> None: self._graph = val + self.recompile() + + def recompile(self) -> None: + """ + Recompile this GraphModule from its `graph` attribute. This should be + called after editing the contained `graph`, otherwise the generated + code of this `GraphModule` will be out of date. + """ self.code = self._graph.python_code(root_module='self') cls = type(self) cls.forward = _forward_from_src(self.code) diff --git a/torch/fx/node.py b/torch/fx/node.py index 7bf57cff4dae..53abead5f044 100644 --- a/torch/fx/node.py +++ b/torch/fx/node.py @@ -5,7 +5,6 @@ if TYPE_CHECKING: from .graph import Graph - BaseArgumentTypes = Union[str, int, float, bool, torch.dtype, torch.Tensor] base_types = BaseArgumentTypes.__args__ # type: ignore @@ -35,5 +34,63 @@ def __init__(self, graph: 'Graph', name: str, op: str, target: Target, self.kwargs = kwargs self.uses = 0 + def find_uses(self) -> List['Node']: + """ + Find all nodes that use the value produced by `self`. The complexity of + this function is linear in the number of nodes * number of arguments to + each node. + + Note that len(find_uses()) is not necessarily equal to attribute `uses`. + This node could be used multiple times in the same `Node`. In that case, + the user node would appear once in the return value here, but `uses` would + account for the total number of times this Node is used by the user node. + e.g. 
a node for `x + x` would have two uses for the `x` node, but the + `x + x` node would appear once in the return from `find_uses` + """ + use_nodes : List[Node] = [] + for node in self.graph._nodes: + def record_use(arg_node : Node) -> None: + if arg_node == self and (len(use_nodes) == 0 or use_nodes[-1] != node): + use_nodes.append(node) + map_arg(node.args, record_use) + map_arg(node.kwargs, record_use) + return use_nodes + def __repr__(self) -> str: return self.name + + def replace_all_uses_with(self, replace_with : 'Node') -> List['Node']: + """ + Replace all uses of `self` in the Graph with the Node `replace_with`. + Returns the list of nodes on which this change was made. + """ + use_nodes : List[Node] = self.find_uses() + for use_node in use_nodes: + def maybe_replace_node(n : Node) -> Node: + if n == self: + self.uses -= 1 + return replace_with + else: + return n + new_args = map_arg(use_node.args, maybe_replace_node) + assert isinstance(new_args, tuple) + use_node.args = new_args + new_kwargs = map_arg(use_node.kwargs, maybe_replace_node) + assert isinstance(new_kwargs, dict) + use_node.kwargs = new_kwargs + + return use_nodes + + +def map_arg(a: Argument, fn: Callable[[Node], Argument]) -> Argument: + """ apply fn to each Node appearing arg. arg may be a list, tuple, slice, or dict with string keys. """ + if isinstance(a, (tuple, list)): + return type(a)(map_arg(elem, fn) for elem in a) + elif isinstance(a, dict): + return {k: map_arg(v, fn) for k, v in a.items()} + elif isinstance(a, slice): + return slice(map_arg(a.start, fn), map_arg(a.stop, fn), map_arg(a.step, fn)) + elif isinstance(a, Node): + return fn(a) + else: + return a diff --git a/torch/jit/annotations.py b/torch/jit/annotations.py index d9fce627e52d..81ceea5f58df 100644 --- a/torch/jit/annotations.py +++ b/torch/jit/annotations.py @@ -271,7 +271,7 @@ def get_enum_value_type(e: Type[enum.Enum], loc): def try_ann_to_type(ann, loc): if ann is None: - return TensorType.get() + return TensorType.getInferred() if inspect.isclass(ann) and issubclass(ann, torch.Tensor): return TensorType.get() if is_tuple(ann): diff --git a/torch/jit/frontend.py b/torch/jit/frontend.py index 952f44161c3f..36dccd04b7e3 100644 --- a/torch/jit/frontend.py +++ b/torch/jit/frontend.py @@ -686,11 +686,10 @@ def build_SliceExpr(ctx, base, slice_expr): return SliceExpr(base.range(), lower, upper, step) def build_Index(ctx, base, index_expr): - if isinstance(index_expr.value, ast.Tuple) or \ - isinstance(index_expr.value, ast.List): + if isinstance(index_expr.value, ast.Tuple): raise NotSupportedError(base.range(), "slicing multiple dimensions with " - "sequences not supported yet") + "tuples not supported yet") return build_expr(ctx, index_expr.value) def build_ExtSlice(ctx, base, extslice): diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp index 23c1731380f5..6e45b8594f9b 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.cpp +++ b/torch/lib/c10d/ProcessGroupNCCL.cpp @@ -107,6 +107,13 @@ std::string getKeyFromDevices(const std::vector& devices) { return deviceList; } +std::string getKeySendRecv(int myRank, int peer) { + int lowRank = myRank < peer ? myRank : peer; + int highRank = myRank < peer ? 
peer : myRank; + std::string sendRecvPair = std::to_string(lowRank) + ":" + std::to_string(highRank); + return sendRecvPair; +} + // Get the list of devices from list of tensors std::vector getDeviceList(const std::vector& tensors) { std::vector res; @@ -232,6 +239,7 @@ const int64_t ProcessGroupNCCL::kWorkCleanupThreadSleepMillis = 1000; constexpr int64_t kWaitForAbortCommStoreKey = 1000; constexpr int64_t kSynchronizeBusyWaitMillis = 10; const int64_t ProcessGroupNCCL::kProcessGroupNCCLOpTimeoutMillis = 10 * 1000; +thread_local uint64_t ProcessGroupNCCL::ncclActiveGroupCounter_ = 0; ProcessGroupNCCL::WorkNCCL::WorkNCCL(const std::vector& devices) : devices_(devices), workStartTime_(std::chrono::steady_clock::now()) { @@ -450,6 +458,8 @@ ProcessGroupNCCL::ProcessGroupNCCL( opTimeout_(options.opTimeout), futureNCCLCallbackStreams_(c10::cuda::device_count()), isHighPriorityStream_(options.isHighPriorityStream) { + TORCH_CHECK(at::cuda::getNumGPUs() != 0, + "ProcessGroupNCCL is only supported with GPUs, no GPUs found!"); try { parseNcclBlockingWait(); } catch (std::exception& e) { @@ -715,7 +725,9 @@ void ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId* ncclID) { std::vector>& ProcessGroupNCCL::getNCCLComm( const std::string& devicesKey, - const std::vector& devices) { + const std::vector& devices, + NCCLCommType commType, + int p2pRank) { // Sanity check if (devicesKey.empty()) { throw std::runtime_error( @@ -742,7 +754,8 @@ std::vector>& ProcessGroupNCCL::getNCCLComm( // Create the unique NCCL ID and broadcast it ncclUniqueId ncclID; - if (rank_ == 0) { + // For point-to-point communication, lower rank of the two will get unique id. + if (rank_ == 0 || (commType != NCCLCommType::COLL && p2pRank == 0)) { C10D_NCCL_CHECK(ncclGetUniqueId(&ncclID)); } @@ -754,13 +767,41 @@ std::vector>& ProcessGroupNCCL::getNCCLComm( std::vector streamVal; streamVal.reserve(devices.size()); - // Create the NCCL communicators for each GPU + // [Group Start/End Note] This is used to ensure that nccl communicator will be created + // before communication primitives are called. Let's look at this example: + // Using the batch_isend_irecv to send a tensor to a target process. On the sender side, + // the corresponding underlying NCCL calls will look like + // ncclGroupStart() // This is in batch_isend_irecv + // ncclGroupStart() // This is [Note 1] + // ncclCommInitRank() // Inside NCCLComm::create + // ncclSend() + // ncclGroupEnd() // This is [Note 2] + // ncclGroupEnd() // This is in batch_isend_irecv + // With this pattern, the nccl communicator will be created in the last ncclGroupEnd + // which means when ncclSend is processed, the passed communicator argument is NULL which will + // lead to runtime error. So we need to "close" all active nccl groups to ensure + // nccl communicator is actually created before encountering any communication calls. + // This is why we need the following for loop. 
+ for (size_t i = 0; i < ncclActiveGroupCounter_; ++i) { + C10D_NCCL_CHECK(ncclGroupEnd()); + } + + // [Note 1] Create the NCCL communicators for each GPU C10D_NCCL_CHECK(ncclGroupStart()); for (size_t i = 0; i < devices.size(); ++i) { // GPU world size and GPU rank - int numRanks = getSize() * devices.size(); - int rank = getRank() * devices.size() + i; + int numRanks, rank; + + if (commType == NCCLCommType::COLL) { + numRanks = getSize() * devices.size(); + rank = getRank() * devices.size() + i; + } else { + // For point-to-point operation, there are only 2 processes involved so + // the GPU rank is either 0 or 1. + numRanks = 2; + rank = p2pRank; + } // Get the device index int deviceIndex = devices[i].index(); @@ -779,8 +820,14 @@ std::vector>& ProcessGroupNCCL::getNCCLComm( } } + // [Note 2 ] C10D_NCCL_CHECK(ncclGroupEnd()); + // See [Group Start/End Note] + for (size_t i = 0; i < ncclActiveGroupCounter_; ++i) { + C10D_NCCL_CHECK(ncclGroupStart()); + } + ncclStreams_.emplace(devicesKey, std::move(streamVal)); // Note: these events are created with the (default) cudaEventDisableTiming @@ -1008,6 +1055,77 @@ std::shared_ptr ProcessGroupNCCL::collective( return work; } +template +std::shared_ptr ProcessGroupNCCL::pointToPoint( + std::vector& tensors, + Fn fn, + int peer, + NCCLCommType commType, + PreProcess pre, + PostProcess post) { + const auto devices = getDeviceList(tensors); + const auto key = getKeySendRecv(rank_, peer); + int p2pRank = rank_ < peer ? 0 : 1; + auto& ncclComms = getNCCLComm(key, devices, commType, p2pRank); + + // First let NCCL streams wait for input tensors allocation streams + syncStreams(devices, ncclEvents_[key], ncclStreams_[key]); + + // Work itself will create the CUDA events on all GPUs of tensors + auto work = initWork(devices); + + if (commType == NCCLCommType::RECV) { + // Store references to outputs and futureNCCLCallbackStream to be used by + // WorkNCCL::getFuture. + work->outputs_ = std::make_shared>(tensors); + work->futureNCCLCallbackStreams_ = futureNCCLCallbackStreams_; + } + + at::cuda::OptionalCUDAGuard gpuGuard; + + pre(ncclStreams_[key]); + + for (size_t i = 0; i < tensors.size(); ++i) { + gpuGuard.set_index(devices[i].index()); + at::cuda::CUDAStream& ncclStream = ncclStreams_[key][i]; + + // Both send tensor and recv tensor are created on a worker stream and used in + // different ncclStreams. Hence, both must record the ncclStream to + // prevent being freed before the collective finishes. + // + // See [Sync Streams]. + c10::cuda::CUDACachingAllocator::recordStream( + tensors[i].storage().data_ptr(), ncclStream); + } + + { + AutoNcclGroup nccl_group_guard; + for (size_t i = 0; i < tensors.size(); ++i) { + gpuGuard.set_index(devices[i].index()); + at::cuda::CUDAStream& ncclStream = ncclStreams_[key][i]; + // For point-to-point communication, NCCL ranks can only + // be 0 or 1. 
+ int p2pTargetRank = 1 - p2pRank; + C10D_NCCL_CHECK( + fn(tensors[i], ncclComms[i]->getNcclComm(), ncclStream, p2pTargetRank)); + } + } + + post(ncclStreams_[key]); + + // Event should only be recorded after the ncclGroupEnd() + for (size_t i = 0; i < tensors.size(); ++i) { + at::cuda::CUDAStream& ncclStream = ncclStreams_[key][i]; + (*work->cudaEvents_)[i].record(ncclStream); + work->ncclComms_[i] = ncclComms[i]; + work->blockingWait_ = blockingWait_; + work->opTimeout_ = opTimeout_; + work->store_ = store_; + } + + return work; +} + template std::shared_ptr ProcessGroupNCCL::collective( std::vector& inputs, @@ -1021,6 +1139,21 @@ std::shared_ptr ProcessGroupNCCL::collective( [](std::vector&) {}); } +template +std::shared_ptr ProcessGroupNCCL::pointToPoint( + std::vector& tensor, + Fn fn, + int peer, + NCCLCommType type) { + return pointToPoint( + tensor, + fn, + peer, + type, + [](std::vector&) {}, + [](std::vector&) {}); +} + std::shared_ptr ProcessGroupNCCL::allreduce( std::vector& tensors, const AllreduceOptions& opts) { @@ -1296,6 +1429,54 @@ std::shared_ptr ProcessGroupNCCL::alltoall_base( }); } } + +std::shared_ptr ProcessGroupNCCL::send( + std::vector& tensors, + int dstRank, + int /* unused */) { + check_gpu_tensors(tensors); + auto ret = pointToPoint( + tensors, + [&](at::Tensor& input, + ncclComm_t comm, + at::cuda::CUDAStream& stream, + int dst) { + return ncclSend( + input.data_ptr(), + input.numel(), + getNcclDataType(input.scalar_type()), + dst, + comm, + stream.stream()); + }, + dstRank, + NCCLCommType::SEND); + return ret; +} + +std::shared_ptr ProcessGroupNCCL::recv( + std::vector& tensors, + int srcRank, + int /* unused */) { + check_gpu_tensors(tensors); + auto ret= pointToPoint( + tensors, + [&](at::Tensor& output, + ncclComm_t comm, + at::cuda::CUDAStream& stream, + int src) { + return ncclRecv( + output.data_ptr(), + output.numel(), + getNcclDataType(output.scalar_type()), + src, + comm, + stream.stream()); + }, + srcRank, + NCCLCommType::RECV); + return ret; +} #else std::shared_ptr ProcessGroupNCCL::alltoall_base( at::Tensor& /* unused */, @@ -1306,7 +1487,37 @@ std::shared_ptr ProcessGroupNCCL::alltoall_base( throw std::runtime_error( "ProcessGroupNCCL only supports alltoall* for NCCL lib version >= 2.7.0"); } + +std::shared_ptr ProcessGroupNCCL::send( + std::vector& /* unused */, + int /* unused */, + int /* unused */) { + throw std::runtime_error( + "ProcessGroupNCCL only supports send for NCCL lib version >= 2.7.0"); +} + +std::shared_ptr ProcessGroupNCCL::recv( + std::vector& /* unused */, + int /* unused */, + int /* unused */) { + throw std::runtime_error( + "ProcessGroupNCCL only supports recv for NCCL lib version >= 2.7.0"); +} +#endif + +void ProcessGroupNCCL::groupStart() { +#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2) + C10D_NCCL_CHECK(ncclGroupStart()); #endif + ++ncclActiveGroupCounter_; +} + +void ProcessGroupNCCL::groupEnd() { +#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2) + C10D_NCCL_CHECK(ncclGroupEnd()); +#endif + --ncclActiveGroupCounter_; +} std::shared_ptr ProcessGroupNCCL::alltoall( std::vector& /* unused */, @@ -1329,24 +1540,10 @@ std::shared_ptr ProcessGroupNCCL::scatter( throw std::runtime_error("ProcessGroupNCCL does not support scatter"); } -std::shared_ptr ProcessGroupNCCL::send( - std::vector& /* unused */, - int /* unused */, - int /* unused */) { - throw std::runtime_error("ProcessGroupNCCL does not support send"); -} - -std::shared_ptr ProcessGroupNCCL::recv( - std::vector& /* unused */, - int /* unused */, - int /* unused */) { - 
throw std::runtime_error("ProcessGroupNCCL does not support recv"); -} - std::shared_ptr ProcessGroupNCCL::recvAnysource( std::vector& /* unused */, int /* unused */) { - throw std::runtime_error("ProcessGroupNCCL does not support recv"); + throw std::runtime_error("ProcessGroupNCCL does not support recvAnysource"); } std::shared_ptr ProcessGroupNCCL::allgather_base( diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp index 06b144438455..b8b3d5aabd35 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.hpp +++ b/torch/lib/c10d/ProcessGroupNCCL.hpp @@ -23,6 +23,13 @@ constexpr const char* NCCL_BLOCKING_WAIT = "NCCL_BLOCKING_WAIT"; // Handling with NCCL. constexpr const char* NCCL_ASYNC_ERROR_HANDLING = "NCCL_ASYNC_ERROR_HANDLING"; +// NCCL Commmunication type +enum class NCCLCommType : std::uint8_t { + SEND = 0, + RECV, + COLL, +}; + // ProcessGroupNCCL implements NCCL bindings for c10d. // // All functions of the class are expected to be called in the same order @@ -420,6 +427,20 @@ class ProcessGroupNCCL : public ProcessGroup { std::vector& inputTensors, const AllToAllOptions& opts = AllToAllOptions()) override; + std::shared_ptr send( + std::vector& tensors, + int dstRank, + int tag) override; + + std::shared_ptr recv( + std::vector& tensors, + int srcRank, + int tag) override; + + static void groupStart(); + + static void groupEnd(); + // Unsupported Ops std::shared_ptr gather( std::vector>& outputTensors, @@ -431,16 +452,6 @@ class ProcessGroupNCCL : public ProcessGroup { std::vector>& inputTensors, const ScatterOptions& opts = ScatterOptions()) override; - std::shared_ptr send( - std::vector& tensors, - int dstRank, - int tag) override; - - std::shared_ptr recv( - std::vector& tensors, - int srcRank, - int tag) override; - std::shared_ptr recvAnysource( std::vector& tensors, int tag) override; @@ -455,7 +466,9 @@ class ProcessGroupNCCL : public ProcessGroup { // a new set of NCCL communicators as a cache entry std::vector>& getNCCLComm( const std::string& devicesKey, - const std::vector& devices); + const std::vector& devices, + NCCLCommType commType = NCCLCommType::COLL, + int p2pRank = 0); // Wrapper method which can be overridden for tests. virtual std::exception_ptr checkForNCCLErrors( @@ -484,6 +497,24 @@ class ProcessGroupNCCL : public ProcessGroup { PreProcess pre, PostProcess post); + // Helper that encapsulates work shared across point-to-point communication + // primitives. It is the same structure as the helper used for collective + // communicaiton primitives. + template + std::shared_ptr pointToPoint( + std::vector& tensor, + Fn fn, + int peer, + NCCLCommType commType); + template + std::shared_ptr pointToPoint( + std::vector& tensor, + Fn fn, + int peer, + NCCLCommType commType, + PreProcess pre, + PostProcess post); + // Checks for NCCL errors on each of the communicators and returns an // appropriate exception_ptr (nullptr if no errors). static std::exception_ptr checkForNCCLErrorsInternal( @@ -525,6 +556,8 @@ class ProcessGroupNCCL : public ProcessGroup { uint64_t ncclCommCounter_{0}; // The NCCL communicator that the process group has cached. + // + // For collective operations: // The key is a list of GPU devices that an operation is operating on // The GPU devices are stored in a device sequence and the cache NCCL // communicator is associated with this GPU device sequence @@ -543,6 +576,13 @@ class ProcessGroupNCCL : public ProcessGroup { // "0,4,5,6,7,1,2,3" // // Note that the order of the device for the tensor list matters. 
+  //
+  // For point-to-point operations:
+  // The key is a string of my current rank and the peer process rank.
+  // e.g., if process 1 and process 2 are involved in a point-to-point communication,
+  // the key will be "1:2" on both processes.
+  // Note: this is for the scenario where there is only 1 GPU per process.
+  // When it comes to multiple GPUs per process, this part may need to be redesigned.
   std::unordered_map<std::string, std::vector<std::shared_ptr<NCCLComm>>> devNCCLCommMap_;
@@ -639,6 +679,11 @@ class ProcessGroupNCCL : public ProcessGroup {

   // Schedule NCCL operations on high priority CUDA streams.
   bool isHighPriorityStream_ = false;
+
+  // The number of active ncclGroupStart() calls. This counter will be increased
+  // by 1 when ncclGroupStart() is called and decreased by 1 when ncclGroupEnd()
+  // is called.
+  static thread_local uint64_t ncclActiveGroupCounter_;
 };

 } // namespace c10d
diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py
index 30e732e6d859..2facc5e0c6eb 100644
--- a/torch/nn/modules/module.py
+++ b/torch/nn/modules/module.py
@@ -349,7 +349,7 @@ def add_module(self, name: str, module: Optional['Module']) -> None:
         elif hasattr(self, name) and name not in self._modules:
             raise KeyError("attribute '{}' already exists".format(name))
         elif '.' in name:
-            raise KeyError("module name can't contain \".\"")
+            raise KeyError("module name can't contain \".\", got: {}".format(name))
         elif name == '':
             raise KeyError("module name can't be empty string \"\"")
         self._modules[name] = module
diff --git a/torch/nn/modules/sparse.py b/torch/nn/modules/sparse.py
index b6997ca7701a..f063ffa2e8eb 100644
--- a/torch/nn/modules/sparse.py
+++ b/torch/nn/modules/sparse.py
@@ -186,11 +186,11 @@ class EmbeddingBag(Module):
     r"""Computes sums or means of 'bags' of embeddings, without instantiating the
     intermediate embeddings.

-    For bags of constant length and no :attr:`per_sample_weights`, this class
+    For bags of constant length, no :attr:`per_sample_weights`, and 2D inputs, this class

-        * with ``mode="sum"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.sum(dim=0)``,
-        * with ``mode="mean"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.mean(dim=0)``,
-        * with ``mode="max"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.max(dim=0)``.
+        * with ``mode="sum"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.sum(dim=1)``,
+        * with ``mode="mean"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.mean(dim=1)``,
+        * with ``mode="max"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.max(dim=1)``.

     However, :class:`~torch.nn.EmbeddingBag` is much more time and memory efficient than using a chain of these
     operations.
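As an illustrative aside (not part of the patch), the corrected equivalence in the EmbeddingBag docstring above can be checked with a small script; the sizes and indices here are arbitrary:

    import torch
    import torch.nn as nn

    # Arbitrary example sizes; 2D input of shape (batch, bag_size).
    embedding = nn.Embedding(10, 3)
    bag = nn.EmbeddingBag(10, 3, mode='sum')
    bag.weight = embedding.weight  # share weights so the outputs are comparable

    indices = torch.tensor([[1, 2, 4, 5], [4, 3, 2, 9]])
    # EmbeddingBag with mode="sum" on 2D input matches Embedding followed by sum over dim=1.
    assert torch.allclose(embedding(indices).sum(dim=1), bag(indices))
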
diff --git a/torch/quantization/fx/fuse.py b/torch/quantization/fx/fuse.py index 0c7e1f90f47a..852de812e39d 100644 --- a/torch/quantization/fx/fuse.py +++ b/torch/quantization/fx/fuse.py @@ -1,11 +1,9 @@ from torch.fx import ( - GraphModule + GraphModule, + map_arg ) -from torch.fx.graph import ( - Graph, - map_arg, -) +from torch.fx.graph import Graph from .pattern_utils import ( is_match, diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index 77a3a47e9dc2..74dee6ea3cf3 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -3,12 +3,12 @@ GraphModule, Proxy, symbolic_trace, + map_arg ) from torch.fx.graph import ( Graph, Node, - map_arg, ) from torch.quantization import ( diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index b2cd30c66812..4f36b31a23d0 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -130,6 +130,17 @@ def requires_mpi(): "c10d was not compiled with the MPI backend", ) +def skip_if_rocm_single_process(func): + """Skips a test for ROCm in a single process environment""" + func.skip_if_rocm = True + + @wraps(func) + def wrapper(*args, **kwargs): + if not TEST_WITH_ROCM: + return func(*args, **kwargs) + raise unittest.SkipTest("Test skipped for ROCm") + + return wrapper def skip_if_rocm(func): """Skips a test for ROCm""" diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 46ba17f61d8f..f26e6c75d37e 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -373,6 +373,9 @@ def sample_inputs(self, device, dtype, requires_grad=False): SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', device_type='cpu', dtypes=[torch.cfloat, torch.cdouble], active_if=(IS_MACOS or IS_WINDOWS)), + SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', + device_type='cuda', dtypes=[torch.float64], + active_if=TEST_WITH_ROCM), )), UnaryUfuncInfo('tanh', ref=np.tanh, diff --git a/torch/testing/_internal/common_quantization.py b/torch/testing/_internal/common_quantization.py index ccbda8232952..468fd9cfdc81 100644 --- a/torch/testing/_internal/common_quantization.py +++ b/torch/testing/_internal/common_quantization.py @@ -653,12 +653,6 @@ def checkGraphModeFxOp(self, model, inputs, quant_type, result = qgraph(*inputs) result_debug = qgraph_debug(*inputs) - # numeric match for debug option for dynamic - # quantized op is not needed right now - if quant_type != QuantType.DYNAMIC: - self.assertEqual((result - result_debug).abs().max(), 0), \ - 'Expecting debug and non-debug option to produce identical result' - qgraph_to_check = qgraph_debug if debug else qgraph if print_debug_info: print() diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 44caea6687f0..9c9d27bf195b 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -147,7 +147,7 @@ def _get_test_report_path(): help='whether to run each test in a subprocess') parser.add_argument('--seed', type=int, default=1234) parser.add_argument('--accept', action='store_true') -parser.add_argument('--ge_config', type=str) +parser.add_argument('--jit_executor', type=str) parser.add_argument('--repeat', type=int, default=1) parser.add_argument('--test_bailouts', action='store_true') parser.add_argument('--save-xml', nargs='?', 
type=str, @@ -158,11 +158,11 @@ def _get_test_report_path(): parser.add_argument('--run-parallel', type=int, default=1) args, remaining = parser.parse_known_args() -if args.ge_config == 'legacy': +if args.jit_executor == 'legacy': GRAPH_EXECUTOR = ProfilingMode.LEGACY -elif args.ge_config == 'profiling': +elif args.jit_executor == 'profiling': GRAPH_EXECUTOR = ProfilingMode.PROFILING -elif args.ge_config == 'simple': +elif args.jit_executor == 'simple': GRAPH_EXECUTOR = ProfilingMode.SIMPLE else: # infer flags based on the default settings diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index 01cddee92365..235e88f3c823 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -34,6 +34,7 @@ skip_if_lt_x_gpu, skip_if_no_gpu, require_n_gpus_for_nccl_backend, + requires_nccl_version, ) from torch._utils_internal import TEST_MASTER_ADDR as MASTER_ADDR from torch._utils_internal import TEST_MASTER_PORT as MASTER_PORT @@ -210,10 +211,13 @@ def _lock(): lf.close() -def _build_tensor(size, value=None, dtype=torch.float): +def _build_tensor(size, value=None, dtype=torch.float, device_id=None): if value is None: value = size - return torch.empty(size, size, size, dtype=dtype).fill_(value) + if device_id is None: + return torch.empty(size, size, size, dtype=dtype).fill_(value) + else: + return torch.empty(size, size, size, dtype=dtype).fill_(value).cuda(device_id) def _build_multidim_tensor(dim, dim_size, value=None): @@ -585,6 +589,182 @@ def test_backend_group(self): def test_backend_full_group(self): self._test_group_override_backend(self._init_full_group_test) + # NCCL Batch SEND RECV + @skip_if_no_gpu + @unittest.skip("NCCL P2P is not enabled for OSS builds") + @unittest.skipIf(BACKEND != "nccl", "NCCL Batch Send Recv Only") + @requires_nccl_version(2700, "Need NCCL 2.7+ for send/recv") + def test_batch_isend_irecv_nccl(self): + self._barrier() + rank = dist.get_rank() + rank_to_GPU = self._init_multigpu_helper() + device_id = rank_to_GPU[rank][0] + p2p_op_list = [] + + for val in ["1", "0"]: + os.environ["NCCL_BLOCKING_WAIT"] = val + for src in range(0, dist.get_world_size()): + send_tensor = _build_tensor(rank + 1, device_id=device_id) + recv_tensor = _build_tensor(src + 1, value=-1, device_id=device_id) + recv_op = dist._P2POp(dist.irecv, recv_tensor, src) + p2p_op_list.append(recv_op) + send_op = dist._P2POp(dist.isend, send_tensor, src) + p2p_op_list.append(send_op) + + reqs = dist._batch_isend_irecv(p2p_op_list) + for req in reqs: + req.wait() + + self._barrier() + + # GLOO Batch SEND RECV CPU + @unittest.skipIf(BACKEND != "gloo", "GLOO Batch Send Recv CPU") + def test_batch_isend_irecv_gloo(self): + self._barrier() + rank = dist.get_rank() + p2p_op_list = [] + + for src in range(0, dist.get_world_size()): + if src == rank: + continue + send_tensor = _build_tensor(rank + 1) + recv_tensor = _build_tensor(src + 1, value=-1) + recv_op = dist._P2POp(dist.irecv, recv_tensor, src) + p2p_op_list.append(recv_op) + send_op = dist._P2POp(dist.isend, send_tensor, src) + p2p_op_list.append(send_op) + + reqs = dist._batch_isend_irecv(p2p_op_list) + for req in reqs: + req.wait() + + self._barrier() + + # GLOO Batch SEND RECV CPU with provided tags + @unittest.skipIf(BACKEND != "gloo", "GLOO Batch Send Recv CPU") + def test_batch_isend_irecv_gloo_tags(self): + self._barrier() + rank = dist.get_rank() + p2p_op_list = [] + + for src in range(0, 
dist.get_world_size()): + if src == rank: + continue + send_tensor = _build_tensor(rank + 1) + recv_tensor = _build_tensor(src + 1, value=-1) + recv_op = dist._P2POp(dist.irecv, recv_tensor, src, tag=src) + p2p_op_list.append(recv_op) + send_op = dist._P2POp(dist.isend, send_tensor, src, tag=rank) + p2p_op_list.append(send_op) + + reqs = dist._batch_isend_irecv(p2p_op_list) + for req in reqs: + req.wait() + + self._barrier() + + # NCCL Batch SEND RECV Tensor Error + @unittest.skip("NCCL P2P is not enabled for OSS builds") + @unittest.skipIf(BACKEND != "nccl", "NCCL Batch Send Recv Only") + @requires_nccl_version(2700, "Need NCCL 2.7+ for send/recv") + def test_batch_isend_irecv_tensor_err(self): + self._barrier() + rank = dist.get_rank() + if rank == 0: + rank_to_GPU = self._init_multigpu_helper() + device_id = rank_to_GPU[rank][0] + with self.assertRaisesRegex( + RuntimeError, "Tensors must be CUDA and dense" + ): + send_tensor = _build_tensor(rank + 1) + send_op = dist._P2POp(dist.isend, send_tensor, 1) + req = dist._batch_isend_irecv([send_op]) + req.wait() + + # NCCL Batch SEND RECV Op Error + @unittest.skip("NCCL P2P is not enabled for OSS builds") + @unittest.skipIf(BACKEND != "nccl", "NCCL Batch Send Recv Only") + @requires_nccl_version(2700, "Need NCCL 2.7+ for send/recv") + def test_batch_isend_irecv_op_err(self): + self._barrier() + rank = dist.get_rank() + if rank == 0: + rank_to_GPU = self._init_multigpu_helper() + device_id = rank_to_GPU[rank][0] + with self.assertRaisesRegex( + RuntimeError, "^Invalid ``op``" + ): + send_tensor = _build_tensor(rank + 1, device_id=device_id) + send_op = dist._P2POp(dist.broadcast, send_tensor, 1) + req = dist._batch_isend_irecv([send_op]) + req.wait() + + # NCCL Batch SEND RECV p2p_op_list Error + @unittest.skip("NCCL P2P is not enabled for OSS builds") + @unittest.skipIf(BACKEND != "nccl", "NCCL Batch Send Recv Only") + @requires_nccl_version(2700, "Need NCCL 2.7+ for send/recv") + def test_batch_isend_irecv_op_list_err(self): + self._barrier() + rank = dist.get_rank() + if rank == 0: + rank_to_GPU = self._init_multigpu_helper() + device_id = rank_to_GPU[rank][0] + with self.assertRaisesRegex( + RuntimeError, "^Invalid ``p2p_op_list``" + ): + send_tensor = _build_tensor(rank + 1) + req = dist._batch_isend_irecv([1, 2]) + req.wait() + + # NCCL Batch SEND RECV Mixed Backend Error + @unittest.skip("NCCL P2P is not enabled for OSS builds") + @unittest.skipIf(BACKEND != "nccl", "NCCL Batch Send Recv Only") + @requires_nccl_version(2700, "Need NCCL 2.7+ for send/recv") + def test_batch_isend_irecv_mixed_backend_err(self): + self._barrier() + rank = dist.get_rank() + rank_to_GPU = self._init_multigpu_helper() + device_id = rank_to_GPU[rank][0] + group_gloo = dist.new_group(ranks=[0, 1], backend="gloo") + group_nccl = dist.new_group(ranks=[0, 1], backend="nccl") + if rank == 0: + with self.assertRaisesRegex( + RuntimeError, "All groups need to use the same backend" + ): + send_tensor = _build_tensor(rank + 1) + send_op_gloo = dist._P2POp(dist.isend, send_tensor, 1, group_gloo) + send_op_nccl = dist._P2POp(dist.isend, send_tensor, 1, group_nccl) + req = dist._batch_isend_irecv([send_op_gloo, send_op_nccl]) + req.wait() + + # NCCL SEND RECV + @unittest.skip("NCCL P2P is not enabled for OSS builds") + @skip_if_no_gpu + @unittest.skipIf(BACKEND != "nccl", "NCCL Send Recv Only") + @requires_nccl_version(2700, "Need NCCL 2.7+ for send/recv") + def test_send_recv_nccl(self): + rank = dist.get_rank() + rank_to_GPU = self._init_multigpu_helper() + device_id = 
rank_to_GPU[rank][0] + + tensor = _build_tensor(rank + 1, device_id=device_id) + + for src in range(0, dist.get_world_size()): + if src == rank: + # Send mode + for dst in range(0, dist.get_world_size()): + if dst == rank: + continue + dist.send(tensor, dst) + else: + # Recv mode + expected_tensor = _build_tensor(src + 1) + output_tensor = _build_tensor(src + 1, value=-1, device_id=device_id) + dist.recv(output_tensor, src) + self.assertEqual(output_tensor, expected_tensor) + + self._barrier() + # SEND RECV @unittest.skipIf(BACKEND == "nccl", "Nccl does not support send/recv") def test_send_recv(self):