diff --git a/.circleci/cimodel/data/binary_build_data.py b/.circleci/cimodel/data/binary_build_data.py index 21b6eebef5a1..7c81ad2f78ae 100644 --- a/.circleci/cimodel/data/binary_build_data.py +++ b/.circleci/cimodel/data/binary_build_data.py @@ -36,6 +36,7 @@ def get_processor_arch_name(gpu_version): "3.6m", "3.7m", "3.8m", + "3.9m" ], conda=dimensions.STANDARD_PYTHON_VERSIONS, libtorch=[ @@ -43,11 +44,18 @@ def get_processor_arch_name(gpu_version): ], ) +# TODO: There's an issue with current Python 3.9 builds that only occurs during +# windows builds, let's just not build 3.9 for windows and figure out how +# to resolve afterwards +PYTHON_VERSIONS_NO_39 = [ + v for v in dimensions.STANDARD_PYTHON_VERSIONS if v not in ['3.9'] +] + CONFIG_TREE_DATA = OrderedDict( linux=(dimensions.GPU_VERSIONS, LINUX_PACKAGE_VARIANTS), macos=([None], OrderedDict( - wheel=dimensions.STANDARD_PYTHON_VERSIONS, - conda=dimensions.STANDARD_PYTHON_VERSIONS, + wheel=PYTHON_VERSIONS_NO_39, + conda=PYTHON_VERSIONS_NO_39, libtorch=[ "3.7", ], @@ -56,8 +64,8 @@ def get_processor_arch_name(gpu_version): windows=( [v for v in dimensions.GPU_VERSIONS if v not in ['cuda92'] + dimensions.ROCM_VERSION_LABELS], OrderedDict( - wheel=dimensions.STANDARD_PYTHON_VERSIONS, - conda=dimensions.STANDARD_PYTHON_VERSIONS, + wheel=PYTHON_VERSIONS_NO_39, + conda=PYTHON_VERSIONS_NO_39, libtorch=[ "3.7", ], diff --git a/.circleci/cimodel/data/dimensions.py b/.circleci/cimodel/data/dimensions.py index 270c96498d39..57489ebe7915 100644 --- a/.circleci/cimodel/data/dimensions.py +++ b/.circleci/cimodel/data/dimensions.py @@ -20,4 +20,5 @@ "3.6", "3.7", "3.8", + "3.9" ] diff --git a/.circleci/cimodel/data/simple/docker_definitions.py b/.circleci/cimodel/data/simple/docker_definitions.py index ef1d7e8d1a70..91f757207915 100644 --- a/.circleci/cimodel/data/simple/docker_definitions.py +++ b/.circleci/cimodel/data/simple/docker_definitions.py @@ -31,6 +31,7 @@ "pytorch-linux-xenial-py3.6-gcc7", "pytorch-linux-bionic-rocm3.7-py3.6", "pytorch-linux-bionic-rocm3.8-py3.6", + "pytorch-linux-bionic-rocm3.9-py3.6", ] diff --git a/.circleci/config.yml b/.circleci/config.yml index 1d4f7de1faa4..b5144dc703ea 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1985,6 +1985,17 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: "pytorch/manylinux-cuda102" + - binary_linux_build: + name: binary_linux_manywheel_3_9m_cpu_devtoolset7_nightly_build + build_environment: "manywheel 3.9m cpu devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/manylinux-cuda102" - binary_linux_build: name: binary_linux_manywheel_3_6m_cu92_devtoolset7_nightly_build build_environment: "manywheel 3.6m cu92 devtoolset7" @@ -2018,6 +2029,17 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: "pytorch/manylinux-cuda92" + - binary_linux_build: + name: binary_linux_manywheel_3_9m_cu92_devtoolset7_nightly_build + build_environment: "manywheel 3.9m cu92 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/manylinux-cuda92" - binary_linux_build: name: binary_linux_manywheel_3_6m_cu101_devtoolset7_nightly_build build_environment: "manywheel 3.6m cu101 devtoolset7" @@ -2051,6 +2073,17 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: "pytorch/manylinux-cuda101" + - binary_linux_build: + name: binary_linux_manywheel_3_9m_cu101_devtoolset7_nightly_build + build_environment: "manywheel 3.9m 
cu101 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/manylinux-cuda101" - binary_linux_build: name: binary_linux_manywheel_3_6m_cu102_devtoolset7_nightly_build build_environment: "manywheel 3.6m cu102 devtoolset7" @@ -2084,6 +2117,17 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: "pytorch/manylinux-cuda102" + - binary_linux_build: + name: binary_linux_manywheel_3_9m_cu102_devtoolset7_nightly_build + build_environment: "manywheel 3.9m cu102 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/manylinux-cuda102" - binary_linux_build: name: binary_linux_manywheel_3_6m_cu110_devtoolset7_nightly_build build_environment: "manywheel 3.6m cu110 devtoolset7" @@ -2117,6 +2161,17 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: "pytorch/manylinux-cuda110" + - binary_linux_build: + name: binary_linux_manywheel_3_9m_cu110_devtoolset7_nightly_build + build_environment: "manywheel 3.9m cu110 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/manylinux-cuda110" - binary_linux_build: name: binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_build build_environment: "manywheel 3.6m rocm3.8 devtoolset7" @@ -2150,6 +2205,17 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: "pytorch/manylinux-rocm:3.8" + - binary_linux_build: + name: binary_linux_manywheel_3_9m_rocm3_8_devtoolset7_nightly_build + build_environment: "manywheel 3.9m rocm3.8 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/manylinux-rocm:3.8" - binary_linux_build: name: binary_linux_manywheel_3_6m_rocm3_9_devtoolset7_nightly_build build_environment: "manywheel 3.6m rocm3.9 devtoolset7" @@ -2183,6 +2249,17 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: "pytorch/manylinux-rocm:3.9" + - binary_linux_build: + name: binary_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly_build + build_environment: "manywheel 3.9m rocm3.9 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/manylinux-rocm:3.9" - binary_linux_build: name: binary_linux_conda_3_6_cpu_devtoolset7_nightly_build build_environment: "conda 3.6 cpu devtoolset7" @@ -2216,6 +2293,17 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: "pytorch/conda-cuda" + - binary_linux_build: + name: binary_linux_conda_3_9_cpu_devtoolset7_nightly_build + build_environment: "conda 3.9 cpu devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/conda-cuda" - binary_linux_build: name: binary_linux_conda_3_6_cu92_devtoolset7_nightly_build build_environment: "conda 3.6 cu92 devtoolset7" @@ -2249,6 +2337,17 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: "pytorch/conda-cuda" + - binary_linux_build: + name: binary_linux_conda_3_9_cu92_devtoolset7_nightly_build + build_environment: "conda 3.9 cu92 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/conda-cuda" - binary_linux_build: name: binary_linux_conda_3_6_cu101_devtoolset7_nightly_build build_environment: "conda 3.6 cu101 devtoolset7" @@ -2282,6 +2381,17 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: 
"pytorch/conda-cuda" + - binary_linux_build: + name: binary_linux_conda_3_9_cu101_devtoolset7_nightly_build + build_environment: "conda 3.9 cu101 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/conda-cuda" - binary_linux_build: name: binary_linux_conda_3_6_cu102_devtoolset7_nightly_build build_environment: "conda 3.6 cu102 devtoolset7" @@ -2315,6 +2425,17 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: "pytorch/conda-cuda" + - binary_linux_build: + name: binary_linux_conda_3_9_cu102_devtoolset7_nightly_build + build_environment: "conda 3.9 cu102 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/conda-cuda" - binary_linux_build: name: binary_linux_conda_3_6_cu110_devtoolset7_nightly_build build_environment: "conda 3.6 cu110 devtoolset7" @@ -2348,6 +2469,17 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: "pytorch/conda-cuda" + - binary_linux_build: + name: binary_linux_conda_3_9_cu110_devtoolset7_nightly_build + build_environment: "conda 3.9 cu110 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/conda-cuda" - binary_linux_build: name: binary_linux_libtorch_3_7m_cpu_devtoolset7_nightly_shared-with-deps_build build_environment: "libtorch 3.7m cpu devtoolset7" @@ -3257,6 +3389,19 @@ workflows: requires: - binary_linux_manywheel_3_8m_cpu_devtoolset7_nightly_build docker_image: "pytorch/manylinux-cuda102" + - binary_linux_test: + name: binary_linux_manywheel_3_9m_cpu_devtoolset7_nightly_test + build_environment: "manywheel 3.9m cpu devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_manywheel_3_9m_cpu_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-cuda102" - binary_linux_test: name: binary_linux_manywheel_3_6m_cu92_devtoolset7_nightly_test build_environment: "manywheel 3.6m cu92 devtoolset7" @@ -3302,6 +3447,21 @@ workflows: docker_image: "pytorch/manylinux-cuda92" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_manywheel_3_9m_cu92_devtoolset7_nightly_test + build_environment: "manywheel 3.9m cu92 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_manywheel_3_9m_cu92_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-cuda92" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - binary_linux_test: name: binary_linux_manywheel_3_6m_cu101_devtoolset7_nightly_test build_environment: "manywheel 3.6m cu101 devtoolset7" @@ -3347,6 +3507,21 @@ workflows: docker_image: "pytorch/manylinux-cuda101" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_manywheel_3_9m_cu101_devtoolset7_nightly_test + build_environment: "manywheel 3.9m cu101 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_manywheel_3_9m_cu101_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-cuda101" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - binary_linux_test: name: binary_linux_manywheel_3_6m_cu102_devtoolset7_nightly_test build_environment: "manywheel 3.6m cu102 devtoolset7" @@ -3392,6 +3567,21 @@ workflows: docker_image: "pytorch/manylinux-cuda102" use_cuda_docker_runtime: 
"1" resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_manywheel_3_9m_cu102_devtoolset7_nightly_test + build_environment: "manywheel 3.9m cu102 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_manywheel_3_9m_cu102_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-cuda102" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - binary_linux_test: name: binary_linux_manywheel_3_6m_cu110_devtoolset7_nightly_test build_environment: "manywheel 3.6m cu110 devtoolset7" @@ -3437,6 +3627,21 @@ workflows: docker_image: "pytorch/manylinux-cuda110" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_manywheel_3_9m_cu110_devtoolset7_nightly_test + build_environment: "manywheel 3.9m cu110 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_manywheel_3_9m_cu110_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-cuda110" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - binary_linux_test: name: binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_test build_environment: "manywheel 3.6m rocm3.8 devtoolset7" @@ -3482,6 +3687,21 @@ workflows: docker_image: "pytorch/manylinux-rocm:3.8" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_manywheel_3_9m_rocm3_8_devtoolset7_nightly_test + build_environment: "manywheel 3.9m rocm3.8 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_manywheel_3_9m_rocm3_8_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:3.8" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - binary_linux_test: name: binary_linux_manywheel_3_6m_rocm3_9_devtoolset7_nightly_test build_environment: "manywheel 3.6m rocm3.9 devtoolset7" @@ -3527,6 +3747,21 @@ workflows: docker_image: "pytorch/manylinux-rocm:3.9" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly_test + build_environment: "manywheel 3.9m rocm3.9 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:3.9" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - binary_linux_test: name: binary_linux_conda_3_6_cpu_devtoolset7_nightly_test build_environment: "conda 3.6 cpu devtoolset7" @@ -3566,6 +3801,19 @@ workflows: requires: - binary_linux_conda_3_8_cpu_devtoolset7_nightly_build docker_image: "pytorch/conda-cuda" + - binary_linux_test: + name: binary_linux_conda_3_9_cpu_devtoolset7_nightly_test + build_environment: "conda 3.9 cpu devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_conda_3_9_cpu_devtoolset7_nightly_build + docker_image: "pytorch/conda-cuda" - binary_linux_test: name: binary_linux_conda_3_6_cu92_devtoolset7_nightly_test build_environment: "conda 3.6 cu92 devtoolset7" @@ -3611,6 +3859,21 @@ workflows: docker_image: "pytorch/conda-cuda" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_conda_3_9_cu92_devtoolset7_nightly_test + build_environment: "conda 3.9 cu92 devtoolset7" + filters: + branches: + only: + - /.*/ + 
tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_conda_3_9_cu92_devtoolset7_nightly_build + docker_image: "pytorch/conda-cuda" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - binary_linux_test: name: binary_linux_conda_3_6_cu101_devtoolset7_nightly_test build_environment: "conda 3.6 cu101 devtoolset7" @@ -3656,6 +3919,21 @@ workflows: docker_image: "pytorch/conda-cuda" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_conda_3_9_cu101_devtoolset7_nightly_test + build_environment: "conda 3.9 cu101 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_conda_3_9_cu101_devtoolset7_nightly_build + docker_image: "pytorch/conda-cuda" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - binary_linux_test: name: binary_linux_conda_3_6_cu102_devtoolset7_nightly_test build_environment: "conda 3.6 cu102 devtoolset7" @@ -3701,6 +3979,21 @@ workflows: docker_image: "pytorch/conda-cuda" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_conda_3_9_cu102_devtoolset7_nightly_test + build_environment: "conda 3.9 cu102 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_conda_3_9_cu102_devtoolset7_nightly_build + docker_image: "pytorch/conda-cuda" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - binary_linux_test: name: binary_linux_conda_3_6_cu110_devtoolset7_nightly_test build_environment: "conda 3.6 cu110 devtoolset7" @@ -3746,6 +4039,21 @@ workflows: docker_image: "pytorch/conda-cuda" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_conda_3_9_cu110_devtoolset7_nightly_test + build_environment: "conda 3.9 cu110 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_conda_3_9_cu110_devtoolset7_nightly_build + docker_image: "pytorch/conda-cuda" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - binary_linux_test: name: binary_linux_libtorch_3_7m_cpu_devtoolset7_nightly_shared-with-deps_test build_environment: "libtorch 3.7m cpu devtoolset7" @@ -4820,6 +5128,20 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel upload_subfolder: cpu + - binary_upload: + name: binary_linux_manywheel_3_9m_cpu_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_manywheel_3_9m_cpu_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: manywheel + upload_subfolder: cpu - binary_upload: name: binary_linux_manywheel_3_6m_cu92_devtoolset7_nightly_upload context: org-member @@ -4862,6 +5184,20 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel upload_subfolder: cu92 + - binary_upload: + name: binary_linux_manywheel_3_9m_cu92_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_manywheel_3_9m_cu92_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: manywheel + upload_subfolder: cu92 - binary_upload: name: binary_linux_manywheel_3_6m_cu101_devtoolset7_nightly_upload context: org-member @@ -4904,6 +5240,20 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel upload_subfolder: cu101 + - binary_upload: + name: 
binary_linux_manywheel_3_9m_cu101_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_manywheel_3_9m_cu101_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: manywheel + upload_subfolder: cu101 - binary_upload: name: binary_linux_manywheel_3_6m_cu102_devtoolset7_nightly_upload context: org-member @@ -4946,6 +5296,20 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel upload_subfolder: cu102 + - binary_upload: + name: binary_linux_manywheel_3_9m_cu102_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_manywheel_3_9m_cu102_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: manywheel + upload_subfolder: cu102 - binary_upload: name: binary_linux_manywheel_3_6m_cu110_devtoolset7_nightly_upload context: org-member @@ -4988,6 +5352,20 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel upload_subfolder: cu110 + - binary_upload: + name: binary_linux_manywheel_3_9m_cu110_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_manywheel_3_9m_cu110_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: manywheel + upload_subfolder: cu110 - binary_upload: name: binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_upload context: org-member @@ -5030,6 +5408,20 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel upload_subfolder: rocm3.8 + - binary_upload: + name: binary_linux_manywheel_3_9m_rocm3_8_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_manywheel_3_9m_rocm3_8_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: manywheel + upload_subfolder: rocm3.8 - binary_upload: name: binary_linux_manywheel_3_6m_rocm3_9_devtoolset7_nightly_upload context: org-member @@ -5072,6 +5464,20 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel upload_subfolder: rocm3.9 + - binary_upload: + name: binary_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: manywheel + upload_subfolder: rocm3.9 - binary_upload: name: binary_linux_conda_3_6_cpu_devtoolset7_nightly_upload context: org-member @@ -5114,6 +5520,20 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: conda upload_subfolder: cpu + - binary_upload: + name: binary_linux_conda_3_9_cpu_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_conda_3_9_cpu_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: conda + upload_subfolder: cpu - binary_upload: name: binary_linux_conda_3_6_cu92_devtoolset7_nightly_upload context: org-member @@ -5156,6 +5576,20 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: conda upload_subfolder: cu92 + - binary_upload: + name: binary_linux_conda_3_9_cu92_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_conda_3_9_cu92_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: conda + upload_subfolder: cu92 - 
binary_upload: name: binary_linux_conda_3_6_cu101_devtoolset7_nightly_upload context: org-member @@ -5198,6 +5632,20 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: conda upload_subfolder: cu101 + - binary_upload: + name: binary_linux_conda_3_9_cu101_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_conda_3_9_cu101_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: conda + upload_subfolder: cu101 - binary_upload: name: binary_linux_conda_3_6_cu102_devtoolset7_nightly_upload context: org-member @@ -5240,6 +5688,20 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: conda upload_subfolder: cu102 + - binary_upload: + name: binary_linux_conda_3_9_cu102_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_conda_3_9_cu102_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: conda + upload_subfolder: cu102 - binary_upload: name: binary_linux_conda_3_6_cu110_devtoolset7_nightly_upload context: org-member @@ -5282,6 +5744,20 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: conda upload_subfolder: cu110 + - binary_upload: + name: binary_linux_conda_3_9_cu110_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_conda_3_9_cu110_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: conda + upload_subfolder: cu110 - binary_upload: name: binary_linux_libtorch_3_7m_cpu_devtoolset7_nightly_shared-with-deps_upload context: org-member @@ -6471,6 +6947,9 @@ workflows: - docker_build_job: name: "docker-pytorch-linux-bionic-rocm3.8-py3.6" image_name: "pytorch-linux-bionic-rocm3.8-py3.6" + - docker_build_job: + name: "docker-pytorch-linux-bionic-rocm3.9-py3.6" + image_name: "pytorch-linux-bionic-rocm3.9-py3.6" - pytorch_linux_build: name: pytorch_linux_xenial_py3_6_gcc5_4_build requires: @@ -7450,6 +7929,16 @@ workflows: only: - postnightly docker_image: "pytorch/manylinux-cuda102" + - smoke_linux_test: + name: smoke_linux_manywheel_3_9m_cpu_devtoolset7_nightly + build_environment: "manywheel 3.9m cpu devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/manylinux-cuda102" - smoke_linux_test: name: smoke_linux_manywheel_3_6m_cu92_devtoolset7_nightly build_environment: "manywheel 3.6m cu92 devtoolset7" @@ -7486,6 +7975,18 @@ workflows: docker_image: "pytorch/manylinux-cuda92" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_manywheel_3_9m_cu92_devtoolset7_nightly + build_environment: "manywheel 3.9m cu92 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/manylinux-cuda92" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - smoke_linux_test: name: smoke_linux_manywheel_3_6m_cu101_devtoolset7_nightly build_environment: "manywheel 3.6m cu101 devtoolset7" @@ -7522,6 +8023,18 @@ workflows: docker_image: "pytorch/manylinux-cuda101" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_manywheel_3_9m_cu101_devtoolset7_nightly + build_environment: "manywheel 3.9m cu101 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/manylinux-cuda101" + use_cuda_docker_runtime: "1" + 
resource_class: gpu.medium - smoke_linux_test: name: smoke_linux_manywheel_3_6m_cu102_devtoolset7_nightly build_environment: "manywheel 3.6m cu102 devtoolset7" @@ -7558,6 +8071,18 @@ workflows: docker_image: "pytorch/manylinux-cuda102" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_manywheel_3_9m_cu102_devtoolset7_nightly + build_environment: "manywheel 3.9m cu102 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/manylinux-cuda102" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - smoke_linux_test: name: smoke_linux_manywheel_3_6m_cu110_devtoolset7_nightly build_environment: "manywheel 3.6m cu110 devtoolset7" @@ -7594,6 +8119,18 @@ workflows: docker_image: "pytorch/manylinux-cuda110" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_manywheel_3_9m_cu110_devtoolset7_nightly + build_environment: "manywheel 3.9m cu110 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/manylinux-cuda110" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - smoke_linux_test: name: smoke_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly build_environment: "manywheel 3.6m rocm3.8 devtoolset7" @@ -7630,6 +8167,18 @@ workflows: docker_image: "pytorch/manylinux-rocm:3.8" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_manywheel_3_9m_rocm3_8_devtoolset7_nightly + build_environment: "manywheel 3.9m rocm3.8 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/manylinux-rocm:3.8" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - smoke_linux_test: name: smoke_linux_manywheel_3_6m_rocm3_9_devtoolset7_nightly build_environment: "manywheel 3.6m rocm3.9 devtoolset7" @@ -7666,6 +8215,18 @@ workflows: docker_image: "pytorch/manylinux-rocm:3.9" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly + build_environment: "manywheel 3.9m rocm3.9 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/manylinux-rocm:3.9" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - smoke_linux_test: name: smoke_linux_conda_3_6_cpu_devtoolset7_nightly build_environment: "conda 3.6 cpu devtoolset7" @@ -7696,6 +8257,16 @@ workflows: only: - postnightly docker_image: "pytorch/conda-cuda" + - smoke_linux_test: + name: smoke_linux_conda_3_9_cpu_devtoolset7_nightly + build_environment: "conda 3.9 cpu devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/conda-cuda" - smoke_linux_test: name: smoke_linux_conda_3_6_cu92_devtoolset7_nightly build_environment: "conda 3.6 cu92 devtoolset7" @@ -7732,6 +8303,18 @@ workflows: docker_image: "pytorch/conda-cuda" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_conda_3_9_cu92_devtoolset7_nightly + build_environment: "conda 3.9 cu92 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/conda-cuda" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - smoke_linux_test: name: smoke_linux_conda_3_6_cu101_devtoolset7_nightly build_environment: "conda 3.6 cu101 devtoolset7" @@ -7768,6 +8351,18 @@ workflows: 
docker_image: "pytorch/conda-cuda" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_conda_3_9_cu101_devtoolset7_nightly + build_environment: "conda 3.9 cu101 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/conda-cuda" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - smoke_linux_test: name: smoke_linux_conda_3_6_cu102_devtoolset7_nightly build_environment: "conda 3.6 cu102 devtoolset7" @@ -7804,6 +8399,18 @@ workflows: docker_image: "pytorch/conda-cuda" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_conda_3_9_cu102_devtoolset7_nightly + build_environment: "conda 3.9 cu102 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/conda-cuda" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - smoke_linux_test: name: smoke_linux_conda_3_6_cu110_devtoolset7_nightly build_environment: "conda 3.6 cu110 devtoolset7" @@ -7840,6 +8447,18 @@ workflows: docker_image: "pytorch/conda-cuda" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_conda_3_9_cu110_devtoolset7_nightly + build_environment: "conda 3.9 cu110 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/conda-cuda" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - smoke_linux_test: name: smoke_linux_libtorch_3_7m_cpu_devtoolset7_nightly_shared-with-deps build_environment: "libtorch 3.7m cpu devtoolset7" diff --git a/.circleci/docker/build.sh b/.circleci/docker/build.sh index 7353bc18c221..019c7f6e9d1c 100755 --- a/.circleci/docker/build.sh +++ b/.circleci/docker/build.sh @@ -288,6 +288,13 @@ case "$image" in VISION=yes ROCM_VERSION=3.8 ;; + pytorch-linux-bionic-rocm3.9-py3.6) + ANACONDA_PYTHON_VERSION=3.6 + PROTOBUF=yes + DB=yes + VISION=yes + ROCM_VERSION=3.9 + ;; *) # Catch-all for builds that are not hardcoded. 
PROTOBUF=yes diff --git a/.circleci/docker/common/install_conda.sh b/.circleci/docker/common/install_conda.sh index c63e28029f07..b7ad26f44836 100755 --- a/.circleci/docker/common/install_conda.sh +++ b/.circleci/docker/common/install_conda.sh @@ -98,11 +98,28 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then # Install some other packages, including those needed for Python test reporting # TODO: Why is scipy pinned - # numba & llvmlite is pinned because of https://github.com/numba/numba/issues/4368 - # scikit-learn is pinned because of - # https://github.com/scikit-learn/scikit-learn/issues/14485 (affects gcc 5.5 - # only) - as_jenkins pip install --progress-bar off pytest scipy==1.1.0 scikit-learn==0.20.3 scikit-image librosa>=0.6.2 psutil numba==0.46.0 llvmlite==0.30.0 unittest-xml-reporting coverage + # Pin MyPy version because new errors are likely to appear with each release + # Pin hypothesis to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136 + as_jenkins pip install --progress-bar off pytest \ + scipy==1.1.0 \ + scikit-image \ + librosa>=0.6.2 \ + psutil \ + numba \ + llvmlite \ + unittest-xml-reporting \ + coverage \ + hypothesis==4.53.2 \ + mypy==0.770 \ + tb-nightly + + # Update scikit-learn to a python-3.8 compatible version + if [[ $(python -c "import sys; print(int(sys.version_info >= (3, 8)))") == "1" ]]; then + as_jenkins pip install --progress-bar off -U scikit-learn + else + # Pinned scikit-learn due to https://github.com/scikit-learn/scikit-learn/issues/14485 (affects gcc 5.5 only) + as_jenkins pip install --progress-bar off scikit-learn==0.20.3 + fi popd fi diff --git a/.circleci/scripts/binary_linux_test.sh b/.circleci/scripts/binary_linux_test.sh index b0d7385d07ee..6be3a0ddefc7 100755 --- a/.circleci/scripts/binary_linux_test.sh +++ b/.circleci/scripts/binary_linux_test.sh @@ -5,12 +5,13 @@ cat >/home/circleci/project/ci_test_script.sh </dev/null elif [[ "$PACKAGE_TYPE" != libtorch ]]; then - python_nodot="\$(echo $DESIRED_PYTHON | tr -d m.u)" python_path="/opt/python/cp\$python_nodot-cp\${python_nodot}" # Prior to Python 3.8 paths were suffixed with an 'm' if [[ -d "\${python_path}/bin" ]]; then @@ -20,6 +21,11 @@ elif [[ "$PACKAGE_TYPE" != libtorch ]]; then fi fi +EXTRA_CONDA_FLAGS="" +if [[ "\$python_nodot" = *39* ]]; then + EXTRA_CONDA_FLAGS="-c=conda-forge" +fi + # Install the package # These network calls should not have 'retry's because they are installing # locally and aren't actually network calls @@ -28,11 +34,11 @@ fi # conda build scripts themselves. 
These should really be consolidated pkg="/final_pkgs/\$(ls /final_pkgs)" if [[ "$PACKAGE_TYPE" == conda ]]; then - conda install -y "\$pkg" --offline + conda install \${EXTRA_CONDA_FLAGS} -y "\$pkg" --offline if [[ "$DESIRED_CUDA" == 'cpu' ]]; then - retry conda install -y cpuonly -c pytorch + retry conda install \${EXTRA_CONDA_FLAGS} -y cpuonly -c pytorch fi - retry conda install -yq future numpy protobuf six + retry conda install \${EXTRA_CONDA_FLAGS} -yq future numpy protobuf six if [[ "$DESIRED_CUDA" != 'cpu' ]]; then # DESIRED_CUDA is in format cu90 or cu102 if [[ "${#DESIRED_CUDA}" == 4 ]]; then @@ -40,7 +46,7 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then else cu_ver="${DESIRED_CUDA:2:2}.${DESIRED_CUDA:4}" fi - retry conda install -yq -c nvidia -c pytorch "cudatoolkit=\${cu_ver}" + retry conda install \${EXTRA_CONDA_FLAGS} -yq -c nvidia -c pytorch "cudatoolkit=\${cu_ver}" fi elif [[ "$PACKAGE_TYPE" != libtorch ]]; then pip install "\$pkg" diff --git a/.gitignore b/.gitignore index e908b405a662..3d2e85be977f 100644 --- a/.gitignore +++ b/.gitignore @@ -33,6 +33,8 @@ docs/cpp/src docs/src/**/* docs/cpp/build docs/cpp/source/api +docs/cpp/source/html/ +docs/cpp/source/latex/ docs/source/generated/ log test/.coverage diff --git a/.jenkins/caffe2/build.sh b/.jenkins/caffe2/build.sh index bba8aa0e0365..3efaf448e4ab 100755 --- a/.jenkins/caffe2/build.sh +++ b/.jenkins/caffe2/build.sh @@ -161,6 +161,11 @@ if [[ $BUILD_ENVIRONMENT == *cuda* ]]; then export PATH="/usr/local/cuda/bin:$PATH" fi if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then + if [[ -n "$IN_CI" ]]; then + # Set ROCM_ARCH to gfx900 and gfx906 for CI builds + echo "Limiting PYTORCH_ROCM_ARCH to gfx90[06] for CI builds" + export PYTORCH_ROCM_ARCH="gfx900;gfx906" + fi # This is needed to enable ImageInput operator in resnet50_trainer build_args+=("USE_OPENCV=ON") # This is needed to read datasets from https://download.caffe2.ai/databases/resnet_trainer.zip diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index 9656ec338fe7..9fd031f49907 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -88,20 +88,16 @@ if [[ "$PIP_USER" = root ]]; then MAYBE_SUDO=sudo fi -# if [[ "$BUILD_ENVIRONMENT" == *ubuntu14.04* ]]; then - # Hotfix, use hypothesis 3.44.6 on Ubuntu 14.04 - # See comments on - # https://github.com/HypothesisWorks/hypothesis-python/commit/eadd62e467d6cee6216e71b391951ec25b4f5830 - $MAYBE_SUDO pip -q uninstall -y hypothesis - $MAYBE_SUDO pip -q uninstall -y coverage - # "pip install hypothesis==3.44.6" from official server is unreliable on - # CircleCI, so we host a copy on S3 instead - $MAYBE_SUDO pip -q install attrs==18.1.0 -f https://s3.amazonaws.com/ossci-linux/wheels/attrs-18.1.0-py2.py3-none-any.whl - $MAYBE_SUDO pip -q install coverage==4.5.1 -f https://s3.amazonaws.com/ossci-linux/wheels/coverage-4.5.1-cp36-cp36m-macosx_10_12_x86_64.whl - $MAYBE_SUDO pip -q install hypothesis==3.44.6 -f https://s3.amazonaws.com/ossci-linux/wheels/hypothesis-3.44.6-py3-none-any.whl -# else -# pip install --user --no-cache-dir hypothesis==3.59.0 -# fi +# Uninstall pre-installed hypothesis and coverage to use an older version as newer +# versions remove the timeout parameter from settings which ideep/conv_transpose_test.py uses +$MAYBE_SUDO pip -q uninstall -y hypothesis +$MAYBE_SUDO pip -q uninstall -y coverage + +# "pip install hypothesis==3.44.6" from official server is unreliable on +# CircleCI, so we host a copy on S3 instead +$MAYBE_SUDO pip -q install attrs==18.1.0 -f 
https://s3.amazonaws.com/ossci-linux/wheels/attrs-18.1.0-py2.py3-none-any.whl +$MAYBE_SUDO pip -q install coverage==4.5.1 -f https://s3.amazonaws.com/ossci-linux/wheels/coverage-4.5.1-cp36-cp36m-macosx_10_12_x86_64.whl +$MAYBE_SUDO pip -q install hypothesis==3.44.6 -f https://s3.amazonaws.com/ossci-linux/wheels/hypothesis-3.44.6-py3-none-any.whl # Collect additional tests to run (outside caffe2/python) EXTRA_TESTS=() diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index b94e797e7010..b6e21c363133 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -151,8 +151,8 @@ if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then fi if [[ -n "$IN_CI" ]]; then - # Set ROCM_ARCH to gtx900 and gtx906 in CircleCI - echo "Limiting PYTORCH_ROCM_ARCH to gfx90[06] for CircleCI builds" + # Set ROCM_ARCH to gfx900 and gfx906 for CI builds + echo "Limiting PYTORCH_ROCM_ARCH to gfx90[06] for CI builds" export PYTORCH_ROCM_ARCH="gfx900;gfx906" fi diff --git a/.jenkins/pytorch/common.sh b/.jenkins/pytorch/common.sh index 7747324676b5..96e3f5d8ede1 100644 --- a/.jenkins/pytorch/common.sh +++ b/.jenkins/pytorch/common.sh @@ -140,4 +140,3 @@ fi retry () { $* || (sleep 1 && $*) || (sleep 2 && $*) } - diff --git a/.jenkins/pytorch/multigpu-test.sh b/.jenkins/pytorch/multigpu-test.sh index fb5e6f54d013..c61da6af707f 100755 --- a/.jenkins/pytorch/multigpu-test.sh +++ b/.jenkins/pytorch/multigpu-test.sh @@ -17,7 +17,6 @@ fi python tools/download_mnist.py --quiet -d test/cpp/api/mnist OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="test/cpp/api/mnist" build/bin/test_api -time python test/run_test.py --verbose -i distributed/test_distributed_fork time python test/run_test.py --verbose -i distributed/test_c10d time python test/run_test.py --verbose -i distributed/test_c10d_spawn assert_git_not_dirty diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 88bcfc93e19d..78ba67c088ee 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -32,18 +32,6 @@ if [[ "$BUILD_ENVIRONMENT" != *ppc64le* ]] && [[ "$BUILD_ENVIRONMENT" != *-bazel # ninja is installed in $HOME/.local/bin, e.g., /var/lib/jenkins/.local/bin for CI user jenkins # but this script should be runnable by any user, including root export PATH="$HOME/.local/bin:$PATH" - - # TODO: Please move this to Docker - # The version is fixed to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136 - pip_install --user "hypothesis==4.53.2" - # Pin MyPy version because new errors are likely to appear with each release - pip_install --user "mypy==0.770" - # Update scikit-learn to a python-3.8 compatible version - if [[ $(python -c "import sys; print(int(sys.version_info >= (3, 8)))") == "1" ]]; then - pip_install -U scikit-learn - fi - - pip_install --user tb-nightly fi # DANGER WILL ROBINSON. 
The LD_PRELOAD here could cause you problems diff --git a/CMakeLists.txt b/CMakeLists.txt index 0bac4993d1b6..2a0dd1c75974 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -722,6 +722,8 @@ endif() if(ANDROID AND (NOT ANDROID_DEBUG_SYMBOLS)) if(CMAKE_COMPILER_IS_GNUCXX) string(APPEND CMAKE_CXX_FLAGS " -s") + elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") + string(APPEND CMAKE_CXX_FLAGS " -g0") else() string(APPEND CMAKE_EXE_LINKER_FLAGS " -s") endif() diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a1b4096592a7..6593e35e4cf9 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -313,6 +313,8 @@ pip install -r requirements.txt # npm install -g katex # Or if you prefer an uncontaminated global executable environment or do not want to go through the node configuration: # npm install katex && export PATH="$PATH:$(pwd)/node_modules/.bin" +# If you're a Facebook employee using a devserver, yarn may be more convenient: +# yarn global add katex ``` 3. Generate the documentation HTML files. The generated files will be in `docs/build/html`. @@ -353,7 +355,7 @@ information on the documentation syntax. We run Doxygen in CI (Travis) to verify that you do not use invalid Doxygen commands. To run this check locally, run `./check-doxygen.sh` from inside -`docs/cpp`. +`docs/cpp/source`. To build the documentation, follow the same steps as above, but run them from `docs/cpp` instead of `docs`. @@ -378,6 +380,14 @@ et my_machine -t="8000:8000" Then navigate to `localhost:8000` in your web browser. +Alternatively, you can run `rsync` on your local machine to copy the files from +your remote machine: +```bash +mkdir -p build cpp/build +rsync -az me@my_machine:/path/to/pytorch/docs/build/html build +rsync -az me@my_machine:/path/to/pytorch/docs/cpp/build/html cpp/build +``` + #### Submitting changes for review It is helpful when submitting a PR that changes the docs to provide a rendered diff --git a/aten/src/ATen/core/boxing/KernelFunction_test.cpp b/aten/src/ATen/core/boxing/KernelFunction_test.cpp index 87517afe27c6..8ba50db14a2b 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_test.cpp +++ b/aten/src/ATen/core/boxing/KernelFunction_test.cpp @@ -110,50 +110,92 @@ OperatorHandle makeDummyOperatorHandle() { // boxed kernels that return refs to tensor arguments, a la inplace/outplace kernels // -void boxed_func_with_tensor_ref_return(const OperatorHandle& /*opHandle*/, Stack* stack) { +void boxed_func_for_inplace_op(const OperatorHandle& /*opHandle*/, Stack* stack) { // (Tensor(a!), Scalar) -> Tensor(a!) EXPECT_EQ(2, stack->size()); ASSERT_TRUE(stack->at(0).isTensor()); - auto a = stack->at(0).toTensor(); + auto t = stack->at(0).toTensor(); ASSERT_TRUE(stack->at(1).isScalar()); - auto b = stack->at(1).toScalar(); + auto s = stack->at(1).toScalar(); - a.add_(b); + t.add_(s); stack->clear(); - torch::jit::push(stack, a); + torch::jit::push(stack, t); } -void boxed_func_with_multiple_tensor_ref_return(const OperatorHandle& /*opHandle*/, Stack* stack) { +void boxed_func_for_outofplace_op(const OperatorHandle& /*opHandle*/, Stack* stack) { + // (Scalar, Tensor(a!)) -> Tensor(a!) 
+ EXPECT_EQ(2, stack->size()); + + ASSERT_TRUE(stack->at(0).isScalar()); + auto s = stack->at(0).toScalar(); + + ASSERT_TRUE(stack->at(1).isTensor()); + auto t = stack->at(1).toTensor(); + + t.add_(s); + + stack->clear(); + torch::jit::push(stack, t); +} + +void boxed_func_for_outofplace_multi_op(const OperatorHandle& /*opHandle*/, Stack* stack) { // (Tensor(a!), Tensor(b!), Scalar, Scalar) -> (Tensor(a!), Tensor(b!)) EXPECT_EQ(4, stack->size()); ASSERT_TRUE(stack->at(0).isTensor()); - auto a = stack->at(0).toTensor(); + auto t1 = stack->at(0).toTensor(); ASSERT_TRUE(stack->at(1).isTensor()); - auto b = stack->at(1).toTensor(); + auto t2 = stack->at(1).toTensor(); ASSERT_TRUE(stack->at(2).isScalar()); - auto c = stack->at(2).toScalar(); + auto s1 = stack->at(2).toScalar(); ASSERT_TRUE(stack->at(3).isScalar()); - auto d = stack->at(3).toScalar(); + auto s2 = stack->at(3).toScalar(); + + t1.add_(s1); + t2.add_(s2); + + stack->clear(); + torch::jit::push(stack, t1); + torch::jit::push(stack, t2); +} + +void boxed_func_for_legacy_outofplace_multi_op(const OperatorHandle& /*opHandle*/, Stack* stack) { + // (Scalar, Scalar, Tensor(a!), Tensor(b!)) -> (Tensor(a!), Tensor(b!)) + EXPECT_EQ(4, stack->size()); + + ASSERT_TRUE(stack->at(0).isScalar()); + auto s1 = stack->at(0).toScalar(); + + ASSERT_TRUE(stack->at(1).isScalar()); + auto s2 = stack->at(1).toScalar(); + + ASSERT_TRUE(stack->at(2).isTensor()); + auto t1 = stack->at(2).toTensor(); + + ASSERT_TRUE(stack->at(3).isTensor()); + auto t2 = stack->at(3).toTensor(); - a.add_(c); - b.add_(d); + t1.add_(s1); + t2.add_(s2); stack->clear(); - torch::jit::push(stack, a); - torch::jit::push(stack, b); + torch::jit::push(stack, t1); + torch::jit::push(stack, t2); } // // boxed calling tests: // +// functional + void expectBoxedCallingWithReturnWorks(const KernelFunction& func) { called_with_args = c10::nullopt; vector stack {3, 4}; @@ -198,50 +240,76 @@ void expectBoxedCallingWithMultiReturnWorks(const KernelFunction& func) { EXPECT_EQ(12, stack[1].toInt()); } -void expectBoxedCallingWithTensorRefReturnWorks(const KernelFunction& func) { - OperatorHandle dummy = makeDummyOperatorHandle(); +// in/out - auto a = at::zeros({1}); - auto b = 1.0f; - vector stack {a, b}; +void expectInPlaceBoxedCallingWorks(const KernelFunction& func) { + OperatorHandle dummy = makeDummyOperatorHandle(); + auto t = at::zeros({1}); + auto s = 1.0f; + vector stack {t, s}; func.callBoxed(dummy, &stack); - // kernel should have updated arg 0 - EXPECT_EQ(a.item().toFloat(), 1.0f); - - // and returned it on the stack + // kernel should have updated out arg and returned it + EXPECT_EQ(t.item().toFloat(), 1.0f); EXPECT_EQ(1, stack.size()); EXPECT_TRUE(stack[0].isTensor()); - auto t = stack[0].toTensor(); - EXPECT_EQ(t.item().toFloat(), 1.0f); + EXPECT_TRUE(stack[0].toTensor().is_same(t)); } -void expectBoxedCallingWithMultipleTensorRefReturnWorks(const KernelFunction& func) { +void expectOutOfPlaceBoxedCallingWorks(const KernelFunction& func) { OperatorHandle dummy = makeDummyOperatorHandle(); - auto a = at::zeros({1}); - auto b = at::zeros({1}); - auto c = 1.0f; - auto d = 2.0f; - vector stack {a, b, c, d}; - + auto s = 1.0f; + auto t = at::zeros({1}); + vector stack {s, t}; func.callBoxed(dummy, &stack); - // kernel should have updated args 0 and 1 - EXPECT_EQ(a.item().toFloat(), 1.0f); - EXPECT_EQ(b.item().toFloat(), 2.0f); + // kernel should have updated out arg and returned it on the stack + EXPECT_EQ(t.item().toFloat(), 1.0f); + EXPECT_EQ(1, stack.size()); + 
EXPECT_TRUE(stack[0].isTensor()); + EXPECT_TRUE(stack[0].toTensor().is_same(t)); +} - // and pushed them onto the stack - EXPECT_EQ(2, stack.size()); +void expectOutOfPlaceMultiBoxedCallingWorks(const KernelFunction& func) { + OperatorHandle dummy = makeDummyOperatorHandle(); + + auto t1 = at::zeros({1}); + auto t2 = at::zeros({1}); + auto s1 = 1.0f; + auto s2 = 2.0f; + vector stack {t1, t2, s1, s2}; + func.callBoxed(dummy, &stack); + // kernel should have updated output args and returned them on the stack + EXPECT_EQ(t1.item().toFloat(), 1.0f); + EXPECT_EQ(t2.item().toFloat(), 2.0f); + EXPECT_EQ(2, stack.size()); EXPECT_TRUE(stack[0].isTensor()); - auto ta = stack[0].toTensor(); - EXPECT_EQ(ta.item().toFloat(), 1.0f); + EXPECT_TRUE(stack[0].toTensor().is_same(t1)); + EXPECT_TRUE(stack[1].isTensor()); + EXPECT_TRUE(stack[1].toTensor().is_same(t2)); +} +void expectLegacyOutOfPlaceMultiBoxedCallingWorks(const KernelFunction& func) { + OperatorHandle dummy = makeDummyOperatorHandle(); + + auto s1 = 1.0f; + auto s2 = 2.0f; + auto t1 = at::zeros({1}); + auto t2 = at::zeros({1}); + vector stack {s1, s2, t1, t2}; + func.callBoxed(dummy, &stack); + + // kernel should have updated output args and returned them on the stack + EXPECT_EQ(t1.item().toFloat(), 1.0f); + EXPECT_EQ(t2.item().toFloat(), 2.0f); + EXPECT_EQ(2, stack.size()); + EXPECT_TRUE(stack[0].isTensor()); + EXPECT_TRUE(stack[0].toTensor().is_same(t1)); EXPECT_TRUE(stack[1].isTensor()); - auto tb = stack[1].toTensor(); - EXPECT_EQ(tb.item().toFloat(), 2.0f); + EXPECT_TRUE(stack[1].toTensor().is_same(t2)); } void expectBoxedCallingFailsWith(const KernelFunction& func, const char* errorMessage) { @@ -254,6 +322,12 @@ void expectBoxedCallingFailsWith(const KernelFunction& func, const char* errorMe }, errorMessage); } +// +// unboxed calling tests: +// + +// functional + // make an unboxed call to a kernel that returns a single value. // void expectUnboxedCallingWithReturnWorks(const KernelFunction& func) { @@ -294,57 +368,84 @@ void expectUnboxedCallingWithMultiReturnWorks(const KernelFunction& func) { EXPECT_EQ((tuple(7, 12)), result); } -// make an unboxed call to a kernel that modifies its first (Tensor) argument -// and returns a reference to it. 
-// -void expectUnboxedCallingWithTensorRefReturnWorks(const KernelFunction& func) { +// in/out + +void expectInPlaceUnboxedCallingWorks(const KernelFunction& func) { OperatorHandle dummy = makeDummyOperatorHandle(); - auto a = at::zeros({1}); + auto t = at::zeros({1}); + at::Tensor& t_out = func.call(dummy, t, 1.0f); - at::Tensor& t = func.call(dummy, a, 1.0f); + // should have updated first arg and returned it + EXPECT_EQ(t.item().toFloat(), 1.0f); + EXPECT_EQ(&t, &t_out); +} + +void expectOutOfPlaceUnboxedCallingWorks(const KernelFunction& func) { + OperatorHandle dummy = makeDummyOperatorHandle(); - EXPECT_EQ(a.item().toFloat(), 1.0f); + auto t = at::zeros({1}); + at::Tensor& t_out = func.call(dummy, 1.0f, t); + + // should have updated out arg and returned it EXPECT_EQ(t.item().toFloat(), 1.0f); + EXPECT_EQ(&t, &t_out); +} + +void expectOutOfPlaceMultiUnboxedCallingWorks(const KernelFunction& func) { + OperatorHandle dummy = makeDummyOperatorHandle(); - EXPECT_EQ(&a, &t); + auto t1 = at::zeros({1}); + auto t2 = at::zeros({1}); + auto s1 = 1.0f; + auto s2 = 2.0f; + + std::tuple tup = func.call< + std::tuple, at::Tensor&, at::Tensor&, at::Scalar, at::Scalar + >(dummy, t1, t2, s1, s2); + + // kernel should have updated out args and returned them in a tuple + EXPECT_EQ(t1.item().toFloat(), 1.0f); + EXPECT_EQ(t2.item().toFloat(), 2.0f); + + auto t1_out = std::get<0>(tup); + EXPECT_EQ(t1_out.item().toFloat(), 1.0f); + EXPECT_TRUE(t1_out.is_same(t1)); + + auto t2_out = std::get<1>(tup); + EXPECT_EQ(t2_out.item().toFloat(), 2.0f); + EXPECT_TRUE(t2_out.is_same(t2)); } -// make an unboxed call to a kernel that modifies its first two (Tensor) arguments -// and returns them. When calling unboxed, these are returned as a tuple. -// -void expectUnboxedCallingWithMultipleTensorRefReturnWorks(const KernelFunction& func) { +void expectLegacyOutOfPlaceMultiUnboxedCallingWorks(const KernelFunction& func) { OperatorHandle dummy = makeDummyOperatorHandle(); - auto a = at::zeros({1}); - auto b = at::zeros({1}); - auto c = 1.0f; - auto d = 2.0f; + auto s1 = 1.0f; + auto s2 = 2.0f; + auto t1 = at::zeros({1}); + auto t2 = at::zeros({1}); std::tuple tup = func.call< - std::tuple, - at::Tensor&, - at::Tensor&, - at::Scalar, - at::Scalar - >(dummy, a, b, c, d); + std::tuple, at::Scalar, at::Scalar, at::Tensor&, at::Tensor& + >(dummy, s1, s2, t1, t2); - // kernel should have updated args 0 and 1 - EXPECT_EQ(a.item().toFloat(), 1.0f); - EXPECT_EQ(b.item().toFloat(), 2.0f); + // kernel should have updated out args and returned them in a tuple + EXPECT_EQ(t1.item().toFloat(), 1.0f); + EXPECT_EQ(t2.item().toFloat(), 2.0f); - // and returned a tuple containing them - auto ta = std::get<0>(tup); - EXPECT_EQ(ta.item().toFloat(), 1.0f); - EXPECT_TRUE(a.is_same(ta)); + auto t1_out = std::get<0>(tup); + EXPECT_EQ(t1_out.item().toFloat(), 1.0f); + EXPECT_TRUE(t1_out.is_same(t1)); - auto tb = std::get<1>(tup); - EXPECT_EQ(tb.item().toFloat(), 2.0f); - EXPECT_TRUE(b.is_same(tb)); + auto t2_out = std::get<1>(tup); + EXPECT_EQ(t2_out.item().toFloat(), 2.0f); + EXPECT_TRUE(t2_out.is_same(t2)); } } +// functional, boxed calling + TEST(KernelFunctionTest, givenBoxedFunction_withReturn_whenCallingBoxed_thenWorks) { KernelFunction func = KernelFunction::makeFromBoxedFunction<&kernels::boxed_func_with_return>(); kernels::expectBoxedCallingWithReturnWorks(func); @@ -360,16 +461,30 @@ TEST(KernelFunctionTest, givenBoxedFunction_withMultiReturn_whenCallingBoxed_the kernels::expectBoxedCallingWithMultiReturnWorks(func); } 
-TEST(KernelFunctionTest, givenBoxedFunction_withTensorRefReturn_whenCallingBoxed_thenWorks) { - KernelFunction func = KernelFunction::makeFromBoxedFunction<&kernels::boxed_func_with_tensor_ref_return>(); - kernels::expectBoxedCallingWithTensorRefReturnWorks(func); +// in/out, boxed calling + +TEST(KernelFunctionTest, givenBoxedFunction_withInPlaceSignature_whenCallingBoxed_thenWorks) { + KernelFunction func = KernelFunction::makeFromBoxedFunction<&kernels::boxed_func_for_inplace_op>(); + kernels::expectInPlaceBoxedCallingWorks(func); +} + +TEST(KernelFunctionTest, givenBoxedFunction_withOutOfPlaceSignature_whenCallingBoxed_thenWorks) { + KernelFunction func = KernelFunction::makeFromBoxedFunction<&kernels::boxed_func_for_outofplace_op>(); + kernels::expectOutOfPlaceBoxedCallingWorks(func); } -TEST(KernelFunctionTest, givenBoxedFunction_withMultipleTensorRefReturn_whenCallingBoxed_thenWorks) { - KernelFunction func = KernelFunction::makeFromBoxedFunction<&kernels::boxed_func_with_multiple_tensor_ref_return>(); - kernels::expectBoxedCallingWithMultipleTensorRefReturnWorks(func); +TEST(KernelFunctionTest, givenBoxedFunction_withOutOfPlaceMultiSignature_whenCallingBoxed_thenWorks) { + KernelFunction func = KernelFunction::makeFromBoxedFunction<&kernels::boxed_func_for_outofplace_multi_op>(); + kernels::expectOutOfPlaceMultiBoxedCallingWorks(func); } +TEST(KernelFunctionTest, givenBoxedFunction_withLegacyOutOfPlaceMultiSignature_whenCallingBoxed_thenWorks) { + KernelFunction func = KernelFunction::makeFromBoxedFunction<&kernels::boxed_func_for_legacy_outofplace_multi_op>(); + kernels::expectLegacyOutOfPlaceMultiBoxedCallingWorks(func); +} + +// functional, unboxed calling + TEST(KernelFunctionTest, givenBoxedFunction_withReturn_whenCallingUnboxed_thenWorks) { KernelFunction func = KernelFunction::makeFromBoxedFunction<&kernels::boxed_func_with_return>(); kernels::expectUnboxedCallingWithReturnWorks(func); @@ -385,16 +500,30 @@ TEST(KernelFunctionTest, givenBoxedFunction_withMultiReturn_whenCallingUnboxed_t kernels::expectUnboxedCallingWithMultiReturnWorks(func); } -TEST(KernelFunctionTest, givenBoxedFunction_withTensorRefReturn_whenCallingUnboxed_thenWorks) { - KernelFunction func = KernelFunction::makeFromBoxedFunction<&kernels::boxed_func_with_tensor_ref_return>(); - kernels::expectUnboxedCallingWithTensorRefReturnWorks(func); +// in/out, unboxed calling + +TEST(KernelFunctionTest, givenBoxedFunction_withInPlaceSignature_whenCallingUnboxed_thenWorks) { + KernelFunction func = KernelFunction::makeFromBoxedFunction<&kernels::boxed_func_for_inplace_op>(); + kernels::expectInPlaceUnboxedCallingWorks(func); } -TEST(KernelFunctionTest, givenBoxedFunction_withMultipleTensorRefReturn_whenCallingUnboxed_thenWorks) { - KernelFunction func = KernelFunction::makeFromBoxedFunction<&kernels::boxed_func_with_multiple_tensor_ref_return>(); - kernels::expectUnboxedCallingWithMultipleTensorRefReturnWorks(func); +TEST(KernelFunctionTest, givenBoxedFunction_withOutOfPlaceSignature_whenCallingUnboxed_thenWorks) { + KernelFunction func = KernelFunction::makeFromBoxedFunction<&kernels::boxed_func_for_outofplace_op>(); + kernels::expectOutOfPlaceUnboxedCallingWorks(func); } +TEST(KernelFunctionTest, givenBoxedFunction_withOutOfPlaceMultiSignature_whenCallingUnboxed_thenWorks) { + KernelFunction func = KernelFunction::makeFromBoxedFunction<&kernels::boxed_func_for_outofplace_multi_op>(); + kernels::expectOutOfPlaceMultiUnboxedCallingWorks(func); +} + +TEST(KernelFunctionTest, 
givenBoxedFunction_withLegacyOutOfPlaceMultiSignature_whenCallingUnboxed_thenWorks) { + KernelFunction func = KernelFunction::makeFromBoxedFunction<&kernels::boxed_func_for_legacy_outofplace_multi_op>(); + kernels::expectLegacyOutOfPlaceMultiUnboxedCallingWorks(func); +} + +// functors etc. + TEST(KernelFunctionTest, givenUnboxedFunctor_withReturn_whenCallingBoxed_thenWorks) { KernelFunction func = KernelFunction::makeFromUnboxedFunctor(std::unique_ptr(std::make_unique())); kernels::expectBoxedCallingWithReturnWorks(func); diff --git a/aten/src/ATen/core/boxing/impl/boxing.h b/aten/src/ATen/core/boxing/impl/boxing.h index 484d462b8ad9..4f9ae1fced70 100644 --- a/aten/src/ATen/core/boxing/impl/boxing.h +++ b/aten/src/ATen/core/boxing/impl/boxing.h @@ -71,50 +71,20 @@ using can_unbox = >; // -// BoxedKernelWrapper -// -// For a given function type FT, BoxedKernelWrapper implements -// -// 1. a `boxArgs` method that boxes the function's arguments - i.e., -// inserts each argument into an IValue that it pushes onto a -// torch::jit::Stack, which it returns -// -// 2. a `call` method that -// - takes a boxed kernel and unboxed arguments as specified by FT, -// - calls `boxArgs` to box the arguments -// - calls the boxed kernel -// - unboxes and returns the result +// boxArgs - utility for pushing unboxed args onto IValue stack // -// The partial specializations below handle various cases: in -// particular, not all types appearing in op signatures are supported, -// and ops returning references have nonstandard wrapper implementations. -// - -// 1. The base specialization of BoxedKernelWrapper should never be instantiated. -// A "no call method defined on BoxedKernelWrapper" compile error means that -// an op signature has failed to trigger any of the partial specializations -// that follow this one. -// -template -struct BoxedKernelWrapper { - // The reason we're not just doing straight up static_assert(false, ...) here: - // Basically, the way to make sure a static_assert only fires if a template - // is actually instantiated (rather than every time the file is parsed) is to use - // template parameters in the expression, e.g. FuncType here. However, since - // `sizeof(FuncType) != sizeof(FuncType)` is always false, this has the same - // effect. - static_assert(sizeof(FuncType) != sizeof(FuncType), - "Function signature contains one or more unsupported parameter and/or return types. " - "Look for a nearby error like " - "\"'call' is not a member of 'c10::impl::BoxedKernelWrapper<(your function type), void>'\" " - "- (your function type) is the unsupported signature."); -}; +template +static torch::jit::Stack boxArgs(Args... args) { + // TODO Reuse stack vector instead of allocating? + torch::jit::Stack stack; + stack.reserve(sizeof...(Args)); + torch::jit::push(stack, std::forward(args)...); + return stack; +} // -// 2. Supported signatures, other than ref-passing. -// - -// helper class whose specializations handle single and multiple return values, respectively +// PopResult is a helper class whose specializations handle popping single and +// multiple return values, respectively. 
// template struct PopResult final { @@ -151,6 +121,46 @@ struct PopResult> final { } }; +// +// BoxedKernelWrapper +// +// For a given function type FT, BoxedKernelWrapper implements +// a `call` method that +// - takes a boxed kernel and unboxed arguments as specified by FT, +// - calls `boxArgs` to box the arguments +// - calls the boxed kernel +// - unboxes and returns the result +// +// The partial specializations below handle various cases: in +// particular, not all types appearing in op signatures are supported, +// and ops returning references have nonstandard wrapper implementations. +// + +// 1. The base specialization of BoxedKernelWrapper should never be instantiated. +// A "no call method defined on BoxedKernelWrapper" compile error means that +// an op signature has failed to trigger any of the partial specializations +// that follow this one. +// +template +struct BoxedKernelWrapper { + // The reason we're not just doing straight up static_assert(false, ...) here: + // Basically, the way to make sure a static_assert only fires if a template + // is actually instantiated (rather than every time the file is parsed) is to use + // template parameters in the expression, e.g. FuncType here. However, since + // `sizeof(FuncType) != sizeof(FuncType)` is always false, this has the same + // effect. + static_assert(sizeof(FuncType) != sizeof(FuncType), + "Function signature contains one or more unsupported parameter and/or return types. " + "Look for a nearby error like " + "\"'call' is not a member of 'c10::impl::BoxedKernelWrapper<(your function type), void>'\" " + "- (your function type) is the unsupported signature."); +}; + +// +// 2. Supported signatures, other than those involving non-const Tensor refs - +// i.e., "functional" ops. +// + template struct BoxedKernelWrapper< Result(Args...), @@ -159,14 +169,6 @@ struct BoxedKernelWrapper< void > > { - static torch::jit::Stack boxArgs(Args... args) { - // TODO Reuse stack vector instead of allocating? - torch::jit::Stack stack; - stack.reserve(sizeof...(Args)); - torch::jit::push(stack, std::forward(args)...); - return stack; - } - static Result call( KernelFunction::InternalBoxedKernelFunction* boxed_kernel_func, OperatorKernel* functor, @@ -194,12 +196,15 @@ struct BoxedKernelWrapper< }; // -// 3. signatures taking a single Tensor reference as their first argument, -// and also returning one. +// 3. in-place and legacy out-of-place ops take a single non-const Tensor +// reference as their first argument, and return it. +// +// Note: all signatures matching this pattern are are assumed to be for such ops. +// Because of this, the generated BoxedKernelWrapper specializations simply +// return the in-place argument. // -// Note that the passed kernels are assumed to be for inplace/outplace ops, -// and the generated BoxedKernelWrapper specializations will simply return -// the initial argument. +// TODO update comment when legacy out-of-place signatures no longer need +// to be supported, due to hacky_wrapper reordering // template @@ -207,21 +212,11 @@ struct BoxedKernelWrapper< at::Tensor&(at::Tensor&, OtherArgs...), std::enable_if_t::value, void> > { - static torch::jit::Stack boxArgs(at::Tensor& outArg, OtherArgs... otherArgs) { - // TODO Reuse stack vector instead of allocating? 
-    torch::jit::Stack stack;
-    stack.reserve(1 + sizeof...(OtherArgs));
-    torch::jit::push_one(stack, outArg);
-    torch::jit::push(stack, std::forward(otherArgs)...);
-    return stack;
-  }
-
   static at::Tensor& call(
     KernelFunction::InternalBoxedKernelFunction* boxed_kernel_func,
     OperatorKernel* functor,
     const OperatorHandle& opHandle,
-    at::Tensor& outArg,
-    OtherArgs... otherArgs
+    at::Tensor& outArg, OtherArgs... otherArgs
   ) {
     torch::jit::Stack stack = boxArgs(outArg, otherArgs...);
     (*boxed_kernel_func)(functor, opHandle, &stack);
@@ -236,30 +231,75 @@ struct BoxedKernelWrapper<
 };
 
 //
-// 4. signatures returning a tuple of Tensor references, and taking the same
-// number of Tensor refs as their initial arguments.
+// 4. out of place ops that take a single non-const Tensor reference as their
+// final argument, and also return it.
 //
-// Note that the passed kernels are assumed to be for inplace/outplace ops,
-// and the generated BoxedKernelWrapper specializations will return a tuple
-// of those initial arguments.
+// Note: all signatures matching this pattern are assumed to be for such ops.
+// This assumption permits the generated BoxedKernelWrapper specializations to simply
+// return out arguments.
 //
+template
+struct BoxedKernelWrapper<
+  at::Tensor&(FirstArg, RestArgs...),
+  std::enable_if_t<
+    can_box_all::value
+    // this skips over in-place (and legacy out-of-place) kernels with a non-const Tensor
+    // arg at the front, so those can unambiguously trigger the preceding specialization.
+    // TODO update comment when hacky_wrapper reorders legacy out-of-place signatures
+    && !is_mutable_tensor_ref::value,
+    void
+  >
+> {
+  static at::Tensor& call(
+    KernelFunction::InternalBoxedKernelFunction* boxed_kernel_func,
+    OperatorKernel* functor,
+    const OperatorHandle& opHandle,
+    FirstArg firstArg, RestArgs... restArgs
+  ) {
+    torch::jit::Stack stack = boxArgs(firstArg, restArgs...);
+    (*boxed_kernel_func)(functor, opHandle, &stack);
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      stack.size() == 1,
+      "Boxed kernel was expected to return a single value on the stack, ",
+      "but instead returned ", stack.size(), " values."
+    );
+
+    return std::get(std::tuple{restArgs...});
+  }
+};
+//
+// 5. out of place ops that take multiple non-const Tensor references as their
+// final arguments, and return them in a std::tuple.
+//
+// Note: all signatures matching this pattern are assumed to be for such ops.
+// This assumption permits the generated BoxedKernelWrapper specializations to simply
+// return the out arguments.
+//
 template
 struct BoxedKernelWrapper<
   Result(Args...),
   std::enable_if_t<
-    can_box_all::value && is_tuple_of_mutable_tensor_refs::value,
+    can_box_all::value && is_tuple_of_mutable_tensor_refs::value
+    // this test skips over legacy kernels with out args at the front, so they can trigger
+    // the specialization that follows.
+    // note: this test is complicated by the fact that boolean value expressions in templates
+    // don't shortcut. some signatures have a result tuple that's wider than the arg list, and
+    // without the length limiting ternary these will cause a template evaluation error on this
+    // test, even if a length check precedes it in the conjunction.
+    // TODO remove when hacky_wrapper reorders legacy kernel out args
+    && !std::is_same<
+      Result,
+      guts::typelist::to_tuple_t<
+        guts::typelist::take_t<
+          guts::typelist::typelist,
+          sizeof...(Args) >= std::tuple_size::value ?
std::tuple_size::value : sizeof...(Args) + > + > + >::value, void > > { - static torch::jit::Stack boxArgs(Args... args) { - // TODO Reuse stack vector instead of allocating? - torch::jit::Stack stack; - stack.reserve(sizeof...(Args)); - torch::jit::push(stack, std::forward(args)...); - return stack; - } - static Result call( KernelFunction::InternalBoxedKernelFunction* boxed_kernel_func, OperatorKernel* functor, @@ -277,15 +317,65 @@ struct BoxedKernelWrapper< "but instead returned ", stack.size(), " values." ); - auto result = guts::tuple_take(ArgTuple{args...}); + auto result = guts::tuple_take(ArgTuple{args...}); static_assert( std::is_same::value, "The parameter list of an op returning a tuple of Tensor references " - "must begin with an equal number of Tensor reference parameters." + "must end with an equal number of Tensor reference parameters." ); return result; } }; +// +// 6. legacy trap for old-school multi-return out functions with mutable args +// at start rather than end of arg list. +// TODO remove when hacky_wrapper reorders legacy kernel out args +// + +template +struct BoxedKernelWrapper< + Result(Args...), + std::enable_if_t< + can_box_all::value && is_tuple_of_mutable_tensor_refs::value + // this test fires passes for legacy kernels with out args at the front. + // note: this test is complicated by the fact that boolean value expressions in templates + // don't shortcut. some signatures have a result tuple that's wider than the arg list, and + // without the length limiting ternary these will cause a template evaluation error on this + // test, even if a length check precedes it in the conjunction. + && std::is_same< + Result, + guts::typelist::to_tuple_t< + guts::typelist::take_t< + guts::typelist::typelist, + sizeof...(Args) >= std::tuple_size::value ? std::tuple_size::value : sizeof...(Args) + > + > + >::value, + void + > +> { + static Result call( + KernelFunction::InternalBoxedKernelFunction* boxed_kernel_func, + OperatorKernel* functor, + const OperatorHandle& opHandle, + Args... args + ) { + using ArgTuple = std::tuple; + constexpr int RetCount = std::tuple_size(); + + torch::jit::Stack stack = boxArgs(args...); + (*boxed_kernel_func)(functor, opHandle, &stack); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + stack.size() == RetCount, + "Boxed kernel was expected to return ", RetCount, " values on the stack, ", + "but instead returned ", stack.size(), " values." + ); + + auto legacy_result = guts::tuple_take(ArgTuple{args...}); + return legacy_result; + } +}; + } // impl } // c10 diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index a5f9354d7ca2..8c5fec73308e 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -387,7 +387,7 @@ inline Return Dispatcher::callWithDispatchKey(const TypedOperatorHandle::boxArgs(args...); + torch::jit::Stack stack = impl::boxArgs(args...); guard.before(op, stack, seq_num); } else { guard.before(op, seq_num); diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 88303c524aa1..9ea18dc8482d 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -82,6 +82,19 @@ struct OptionalArray { } }; +// Capsule is an internal implementation detail of custom C++ classes. We +// define it as an owning wrapper for +// c10::intrusive_ptr This wrapper is here to serve as +// an abstraction of the type erased custom class object pointer. 
It also allow +// pybind11 to treat this as a standalone class to register as a separate type +// caster, instead of a custom pointer holder which the pointer holder type +// caster try to "unwrap" it automatically. +struct Capsule { + c10::intrusive_ptr obj_ptr; + explicit Capsule(c10::intrusive_ptr ptr) + : obj_ptr(std::move(ptr)) {} +}; + // IValue is the generic tagged union used by the interpreter to hold // all value types. // It is a 16-byte object with an 8-byte payload and an 8-byte tag. @@ -327,8 +340,7 @@ struct CAFFE2_API IValue final { /// @private [doxygen private] c10::intrusive_ptr toBlob() const&; - // Capsule. Capsule is an internal implementation detail - // of custom C++ classes. No new callsites of these APIs should + // Capsule. No new callsites of these APIs should // be introduced. static inline IValue make_capsule( intrusive_ptr blob); diff --git a/aten/src/ATen/cpu/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec256/vec256_complex_double.h index 1df674383fb3..83a5753fabd2 100644 --- a/aten/src/ATen/cpu/vec256/vec256_complex_double.h +++ b/aten/src/ATen/cpu/vec256/vec256_complex_double.h @@ -275,18 +275,7 @@ template <> class Vec256> { return _mm256_round_pd(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); } Vec256> sqrt() const { - // sqrt(a + bi) - // = sqrt(2)/2 * [sqrt(sqrt(a**2 + b**2) + a) + sgn(b)*sqrt(sqrt(a**2 + b**2) - a)i] - // = sqrt(2)/2 * [sqrt(abs() + a) + sgn(b)*sqrt(abs() - a)i] - - const __m256d scalar = _mm256_set1_pd(std::sqrt(2)/2); //sqrt(2)/2 sqrt(2)/2 - const __m256d sign_mask = _mm256_setr_pd(0.0, -0.0, 0.0, -0.0); - auto sign = _mm256_and_pd(values, sign_mask); - auto factor = _mm256_or_pd(scalar, sign); - - auto a_a = _mm256_xor_pd(_mm256_movedup_pd(values), sign_mask); // a -a - auto res_re_im = _mm256_sqrt_pd(_mm256_add_pd(abs_(), a_a)); // sqrt(abs + a) sqrt(abs - a) - return _mm256_mul_pd(factor, res_re_im); + return map(std::sqrt); } Vec256> reciprocal() const; Vec256> rsqrt() const { diff --git a/aten/src/ATen/cpu/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec256/vec256_complex_float.h index dc8ef7cc76d6..28032651f636 100644 --- a/aten/src/ATen/cpu/vec256/vec256_complex_float.h +++ b/aten/src/ATen/cpu/vec256/vec256_complex_float.h @@ -313,18 +313,7 @@ template <> class Vec256> { return _mm256_round_ps(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); } Vec256> sqrt() const { - // sqrt(a + bi) - // = sqrt(2)/2 * [sqrt(sqrt(a**2 + b**2) + a) + sgn(b)*sqrt(sqrt(a**2 + b**2) - a)i] - // = sqrt(2)/2 * [sqrt(abs() + a) + sgn(b)*sqrt(abs() - a)i] - - const __m256 scalar = _mm256_set1_ps(std::sqrt(2)/2); //sqrt(2)/2 sqrt(2)/2 - const __m256 sign_mask = _mm256_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0); - auto sign = _mm256_and_ps(values, sign_mask); - auto factor = _mm256_or_ps(scalar, sign); - - auto a_a = _mm256_xor_ps(_mm256_moveldup_ps(values), sign_mask); // a -a - auto res_re_im = _mm256_sqrt_ps(_mm256_add_ps(abs_(), a_a)); // sqrt(abs + a) sqrt(abs - a) - return _mm256_mul_ps(factor, res_re_im); + return map(std::sqrt); } Vec256> reciprocal() const; Vec256> rsqrt() const { diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp index 360069998f19..1780a553d73d 100644 --- a/aten/src/ATen/native/Copy.cpp +++ b/aten/src/ATen/native/Copy.cpp @@ -13,6 +13,11 @@ #include #include +#ifdef USE_FBGEMM +#include +#include +#endif + namespace { using namespace at; @@ -94,6 +99,31 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking) TORCH_CHECK(self.defined(), "self is 
undefined"); TORCH_CHECK(src.defined(), "src is undefined"); + // FBGeMM kernel support exists only for the following case, + // 1. Memory Format for source and destination tensors is contiguous. + // 2. Device for both the source and destination tensor is CPU. + // 3. dtype conversion between FP32->FP16 and FP16->FP32. + #ifdef USE_FBGEMM + if (((self.dtype() == at::kFloat && src.dtype() == at::kHalf) || + (self.dtype() == at::kHalf && src.dtype() == at::kFloat)) && + (self.device().is_cpu() && src.device().is_cpu()) && + !self.is_sparse() && !src.is_sparse() && + ((self.is_contiguous() && src.is_contiguous()) || + (self.is_non_overlapping_and_dense() && self.strides() == src.strides()))) { + if (src.dtype() == at::kFloat && self.dtype() == at::kHalf) { + auto* output_ptr = reinterpret_cast( + self.data_ptr()); + fbgemm::FloatToFloat16_simd(src.data_ptr(), output_ptr, self.numel()); + } else { + auto in_data = reinterpret_cast( + src.data_ptr()); + auto* output_ptr = self.data_ptr(); + fbgemm::Float16ToFloat_simd(in_data, output_ptr, self.numel()); + } + return self; + } + #endif + if (self.is_sparse() && src.is_sparse()) { return at::copy_sparse_to_sparse_(self, src, non_blocking); } else if (self.is_sparse() || src.is_sparse()) { diff --git a/aten/src/ATen/native/ForeachOpsKernels.cpp b/aten/src/ATen/native/ForeachOpsKernels.cpp index 7dd96b1c8d99..5066ca529869 100644 --- a/aten/src/ATen/native/ForeachOpsKernels.cpp +++ b/aten/src/ATen/native/ForeachOpsKernels.cpp @@ -149,48 +149,20 @@ void foreach_tensor_##OP##_scalarlist_slow_(TensorList input, TensorList tensors FOREACH_BINARY_OP_LIST_ALPHA(add); FOREACH_BINARY_OP_LIST_ALPHA(sub); - FOREACH_BINARY_OP_SCALAR(add); FOREACH_BINARY_OP_SCALAR(sub); FOREACH_BINARY_OP_SCALAR(mul); FOREACH_BINARY_OP_SCALAR(div); - FOREACH_BINARY_OP_SCALARLIST(add); FOREACH_BINARY_OP_SCALARLIST(sub); FOREACH_BINARY_OP_SCALARLIST(mul); FOREACH_BINARY_OP_SCALARLIST(div); - FOREACH_BINARY_OP_LIST(mul); FOREACH_BINARY_OP_LIST(div); - FOREACH_UNARY_OP(sqrt); FOREACH_UNARY_OP(exp); -FOREACH_UNARY_OP(abs); -FOREACH_UNARY_OP(acos); -FOREACH_UNARY_OP(asin); -FOREACH_UNARY_OP(atan); -FOREACH_UNARY_OP(ceil); -FOREACH_UNARY_OP(cos); -FOREACH_UNARY_OP(cosh); -FOREACH_UNARY_OP(erf); -FOREACH_UNARY_OP(erfc); -FOREACH_UNARY_OP(expm1); -FOREACH_UNARY_OP(floor); -FOREACH_UNARY_OP(log); -FOREACH_UNARY_OP(log10); -FOREACH_UNARY_OP(log1p); -FOREACH_UNARY_OP(log2); -FOREACH_UNARY_OP(neg); -FOREACH_UNARY_OP(tan); -FOREACH_UNARY_OP(tanh); -FOREACH_UNARY_OP(sin); -FOREACH_UNARY_OP(sinh); -FOREACH_UNARY_OP(round); -FOREACH_UNARY_OP(lgamma); - FOREACH_POINTWISE_OP_SCALAR(addcdiv); FOREACH_POINTWISE_OP_SCALAR(addcmul); - FOREACH_POINTWISE_OP_SCALARLIST(addcdiv); FOREACH_POINTWISE_OP_SCALARLIST(addcmul); diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index c9e03aaa3b6b..6f66c7a120fe 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -136,241 +136,331 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra return result; } -Tensor einsum(std::string eqn, TensorList tensors) { - constexpr size_t number_of_letters = 26; - std::string in_eqn; - size_t pos; - // The equation is given in terms of single lowercase letters ('a'..'z') and potentially an ellipsis. - // Internally, we represent it using indices from 0 to num_total_dimensions, with each letter - // mapped to an index and the ellipsis ('...') being mapped to a number of consequtive indices. 
- // The mapping of letters to internal indices is given in letter_mapping. A value of -1 means that - // the letter has not been assigned an index yet (because it has not been seen). - // The ellipsis is defined by first_ell_idx (the first index) and num_ell_idxes (the number of indices). - // A value of -1 for num_ell_idxes specifies that we have not seen an ellipsis yet. - // Note: The internal indices are NOT the dimensions used internally. There is a mapping to them below. - - std::array letter_mapping; // map letter to internal (numerical) label - letter_mapping.fill(-1); - int64_t num_ell_idxes = -1; - int64_t first_ell_idx = 0; - - // The internal representation of the left hand side fo the equation (with ellipsis expanded) is stored in input_op_idxes. - // For each operand, we have a vector mapping each dimension to an internal index. - // We also keep track of the number of occurrences for each letter (to infer a right hand side if not given) and - // of the last occurrence of each index. - std::vector> input_op_idxes; // the parsed operand indices - std::array num_letter_occurrences; // number of occurrence in the equation of this letter - num_letter_occurrences.fill(0); - std::vector last_idx_occurrence; // the last operator (left to right) using this index - - if ((pos = eqn.find("->")) != std::string::npos) { // check whether we have a right hand side. in_eq is the left hand side - in_eqn = eqn.substr(0, pos); - } else { - in_eqn = eqn; - } - // remove spaces for einsum compatibility (#9929) - in_eqn.erase(std::remove_if(in_eqn.begin(), in_eqn.end(), isspace), in_eqn.end()); - - // next we parse in_eq (the left hand side) by iterating. It is a string of comma separated terms per index - int64_t operand = 0; - std::stringstream eqn_stream(in_eqn); - std::string term; - int64_t num_total_idxes = 0; - while (! eqn_stream.eof()) { - std::getline(eqn_stream, term, ','); // term = string with indices of current term - TORCH_CHECK((int64_t) tensors.size()>operand, "more operands in equation than tensors"); // we cannot have a longer equation than operands. We need to check here before we use the dimension - - int64_t ell_char_count = 0; // handling of ellipsis '...' is a bit tedious, we count the '.' - // if there is an ellipsis, the number of dimensions it represents must be total dim - letter dimensions - int64_t candidate_num_ell_idxes = tensors[operand].dim() - term.size() + 3; - int64_t dims_in_term = 0; // dimensions we have seen - std::vector current_op_idxes; // mapping of operand dimensions to indices for current term - for (auto &c : term) { // c = character with a single letter or '.' - if (c == '.') { - ell_char_count++; - TORCH_CHECK(ell_char_count <= 3, "can only have '.' in one ellispis '...' 
in term ", operand, " of the equation"); - if (ell_char_count == 3) { // this completes the ellipsis - if (num_ell_idxes == -1) { // if we have not seen an ellipsis before, keep track of indices and size - first_ell_idx = num_total_idxes; - num_ell_idxes = candidate_num_ell_idxes; - num_total_idxes += num_ell_idxes; - } - else { // we have seen an ellipsis before, so we check compatibility - TORCH_CHECK(candidate_num_ell_idxes == num_ell_idxes, - "ellipsis must represent ", num_ell_idxes, " dimensions in all terms"); - } - for (int64_t i = 0; i < num_ell_idxes; ++i) { // map ellipsis dimensions in operand to indices - current_op_idxes.push_back(first_ell_idx + i); - last_idx_occurrence.push_back(operand); - } - dims_in_term += num_ell_idxes; // keep track of dimensions - } - } else { // a letter (hopefully) - TORCH_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' must only occur in ellipsis, operand ", operand); - TORCH_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); - int64_t letter_num = c-'a'; // letter_num = position in letter_mapping - if (letter_mapping[letter_num] == -1) { // new letter, add internal index and mapping - letter_mapping[letter_num] = num_total_idxes; - num_total_idxes++; - last_idx_occurrence.push_back(operand); - } else { // letter we have already seen - last_idx_occurrence[letter_mapping[letter_num]] = operand; - } - num_letter_occurrences[letter_num]++; - current_op_idxes.push_back(letter_mapping[letter_num]); - dims_in_term++; - } +// There are roughly three parts to compute einsum: +// 1. Parse equation to extract the labels for each input operand and output +// 2. Unsqueeze missing dimensions from input operands and permute to align them +// 3. Compute result by multiplying input operands and summing contraction +// dimensions We do the last part by reducing to bmm. +Tensor einsum(std::string equation, TensorList operands) { + TORCH_CHECK(!operands.empty(), "einsum() must provide at least one operand"); + checkDeviceType("einsum()", operands, operands[0].device().type()); + + // Code for encoding ellipsis ("...") with labels + constexpr int ELLIPSIS = '.'; + + // Find arrow (->) to split equation into lhs and rhs + const auto arrow_pos = equation.find("->"); + + // Convert labels for input operands into an index in [0, 25] and store + // them in op_labels for each operand along with ELLIPSIS. + std::string lhs = equation.substr(0, arrow_pos); + std::vector> op_labels(operands.size()); + bool found_ell = false; + std::string::size_type curr_op = 0; + for (auto i = decltype(lhs.length()){0}; i < lhs.length(); ++i) { + switch (lhs[i]) { + case ' ': + // Ignore spaces + break; + + case '.': + TORCH_CHECK( + // Only one ellipsis per operand can be given + !found_ell, + "einsum() found \'.\' for operand ", + curr_op, + " for which an ellipsis was already found"); + TORCH_CHECK( + // Ensure it's a valid ellipsis + i + 2 < lhs.length() && lhs[++i] == '.' 
&& lhs[++i] == '.', + "einsum() found \'.\' for operand ", + curr_op, + " that is not part of any ellipsis"); + op_labels[curr_op].push_back(ELLIPSIS); + found_ell = true; + break; + + case ',': + // Move onto next operand + ++curr_op; + TORCH_CHECK( + curr_op < operands.size(), + "einsum() fewer operands were provided than specified in the equation"); + found_ell = false; + break; + + default: + // Parse label + TORCH_CHECK( + lhs[i] >= 'a' && lhs[i] <= 'z', + "einsum() operand subscript must be in range [a, z] but found ", + lhs[i], + " for operand ", + curr_op); + // Convert label to index in [0, 25] and store + op_labels[curr_op].push_back(lhs[i] - 'a'); } - TORCH_CHECK(dims_in_term == tensors[operand].dim(), "dimension mismatch for operand ", operand, ": equation ", dims_in_term, " tensor ", tensors[operand].dim()); - input_op_idxes.push_back(std::move(current_op_idxes)); - operand++; } - // in the check below, we need ==, but > is captured above, so the error message can be specific that it is <. - TORCH_CHECK((int64_t) tensors.size()==operand, "more tensors than operands in equation"); - - // the following parses or infers output (right hand side) - // it also assigns the idxes_to_preprocessed_dims (index -> dimension in preprocessed / output tensors) - // for the output indices. -1 means that the index has not been assigned a dimension yet - std::vector idxes_to_preprocessed_dims(num_total_idxes, -1); // the position of the index in the tensor dimensions - int64_t num_output_dims = 0; - if (pos != std::string::npos) { // parse the user provided right hand side - int64_t ell_char_count = 0; - for (auto &c : eqn.substr(pos+2)) { - if (c == '.') { // '.' as part of ellipsis - ell_char_count++; - TORCH_CHECK(ell_char_count <= 3, "can only have '.' in one ellispis '...' in right hand side of the equation"); - if (ell_char_count == 3) { // ellipsis complete - TORCH_CHECK(num_ell_idxes >= 0, "ellipsis '...' may only appear in right hand side if it does in left hand side"); - for (int64_t i = 0; i < num_ell_idxes; ++i) { - idxes_to_preprocessed_dims[first_ell_idx + i] = num_output_dims; - num_output_dims++; - } - } - } else if (! isspace(c)) { // letter (hopefully) - TORCH_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' must only occur in ellipsis in the right hand side"); - TORCH_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); - int64_t letter_num = c-'a'; - TORCH_CHECK(idxes_to_preprocessed_dims[letter_mapping[letter_num]] == -1, "index ", c, " occurs twice in output"); - idxes_to_preprocessed_dims[letter_mapping[letter_num]] = num_output_dims; - num_output_dims++; + + TORCH_CHECK( + curr_op == operands.size() - 1, + "einsum() more operands were provided than specified in the equation"); + + // Labels must be within [a, z]. + constexpr int total_labels = 'z' - 'a' + 1; + std::vector label_count(total_labels, 0); + + // The maximum number of dimensions covered by any ellipsis, needed when + // unsqueezing missing dimensions from operands to permute and broadcast + int64_t ell_num_dim = 0; + + // Compute label frequency and number of dimensions covered by ellipsis + // We do this after parsing labels to make it more readable and simpler + // to compute the number of dimensions covered by ellipsis. 
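As a worked illustration of the bookkeeping above (the equation and shapes are chosen for illustration, not taken from the patch): for "...ij,jk" with operands of shapes {5, 2, 3} and {3, 4}, operand 0 parses to labels {ELLIPSIS, 'i'-'a', 'j'-'a'} with nlabels = 2 and ndims = 3, so its ellipsis covers one dimension, while operand 1 parses to {'j'-'a', 'k'-'a'} with no ellipsis. That gives ell_num_dim = 1 and a label_count of 1 for i, 2 for j, and 1 for k; since there is no "->", the implicit output is "...ik".

#include <ATen/ATen.h>
#include <vector>

int main() {
  auto a = at::randn({5, 2, 3});
  auto b = at::randn({3, 4});
  // j appears twice, so it is contracted; the ellipsis dim, i, and k remain
  auto out = at::einsum("...ij,jk", {a, b});
  std::vector<int64_t> expected{5, 2, 4};
  TORCH_CHECK(out.sizes().vec() == expected);
  return 0;
}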
+ for (std::size_t i = 0; i < operands.size(); ++i) { + Tensor operand = operands[i]; + std::vector labels = op_labels[i]; + int64_t nlabels = labels.size(); + int64_t ndims = operand.dim(); + bool has_ellipsis = false; + + for (int label : labels) { + if (label == ELLIPSIS) { + --nlabels; + has_ellipsis = true; + ell_num_dim = std::max(ell_num_dim, ndims - nlabels); + } else { + ++label_count[label]; } } - } else { // create an inferred right hand side - // the ellipsis (if in the lhs) comes first - if (num_ell_idxes >= 0) { - for (int64_t i = 0; i < num_ell_idxes; ++i) { - idxes_to_preprocessed_dims[first_ell_idx + i] = num_output_dims; - num_output_dims++; + + TORCH_CHECK( + has_ellipsis ? nlabels <= ndims : nlabels == ndims, + "einsum() the number of subscripts in the equation (", + nlabels, + has_ellipsis ? ") is more than the number of dimensions (" + : ") does not match the number of dimensions (", + ndims, + ") for operand ", + i, + has_ellipsis ? "" : " and no ellipsis was given"); + } + + // Mapping of label to index in the permuted tensors (out_dims + sum_dims) + // This will be used for aligning the dimensions of all input operands + std::vector label_perm_index(total_labels, -1); + + // Current index in the permuted shape + int perm_index = 0; + + // Start index of ellipsis dimensions in the permuted shape + int64_t ell_index = 0; + + if (arrow_pos == std::string::npos) { + // Implicit output is ellipsis (...) + labels seen only once + perm_index = ell_num_dim; + for (int label = 0; label < total_labels; ++label) { + if (label_count[label] == 1) { + label_perm_index[label] = perm_index++; } } - // then the indices that occur exactly once in alphabetic order - for (size_t idx = 0; idx < number_of_letters; idx++) { - if (num_letter_occurrences[idx] == 1) { - idxes_to_preprocessed_dims[letter_mapping[idx]] = num_output_dims; - num_output_dims++; + } else { + // Parse explicit output + std::string rhs = equation.substr(arrow_pos + 2); + found_ell = false; + for (std::size_t i = 0; i < rhs.length(); ++i) { + switch (rhs[i]) { + case ' ': + // Ignore spaces + break; + + case '.': + TORCH_CHECK( + // There can only be one ellipsis in the output + !found_ell, + "einsum() found \'.\' for output but an ellipsis (...) was already found"); + TORCH_CHECK( + // Ensure ellipsis is correct + i + 2 < rhs.length() && rhs[++i] == '.' && rhs[++i] == '.', + "einsum() found \'.\' for output that is not part of any ellipsis (...)"); + ell_index = perm_index; + perm_index += ell_num_dim; + found_ell = true; + break; + + default: + TORCH_CHECK( + rhs[i] >= 'a' && rhs[i] <= 'z', + "einsum() subscripts must be in range [a, z] but found ", + rhs[i], + " for the output"); + TORCH_CHECK( + // Ensure label appeared at least once for some input operand and at + // most once for the output + label_count[rhs[i] - 'a'] > 0, + "einsum() output subscript ", + rhs[i], + label_count[rhs[i] - 'a'] == -1 + ? " appears more than once in the output" + : " does not appear in the equation for any input operand"); + label_perm_index[rhs[i] - 'a'] = perm_index++; + + // Set to -1 to mark that this label already appeared in the output + label_count[rhs[i] - 'a'] = -1; } } + + TORCH_CHECK( + // Dimensions under ellipsis are not contracted, so ensure it appears in output + ell_num_dim <= 0 || found_ell, + "einsum() ellipsis (...) 
covering one or more dimensions was given in the input but not in the output"); } - // now we assign the idxes_to_preprocessed_dims (index -> dimension in preprocessed / output tensors) - // for the non-output indices - those that are eventually summed over - int64_t position = num_output_dims; - for (int64_t i = 0; i < num_total_idxes; i++) { - if (idxes_to_preprocessed_dims[i]==-1) { - idxes_to_preprocessed_dims[i] = position; - position++; + + // Save output size before adding sum dims + int out_size = perm_index; + + // Add contraction labels (labels not present in output) + for (int label = 0; label < total_labels; ++label) { + if (label_count[label] > 0 && label_perm_index[label] == -1) { + label_perm_index[label] = perm_index++; } } - // we now "homogenize the dimensions", i.e. - // - take diagonals for duplicated indices - // - permute the dimensions to match the order given by idxes_to_preprocessed_dims - // - unsqueeze to create all dimensions for each index in each tensor where they are missing - // we also check that sizes match - // after this, all operands will have compatible shapes (i.e. all dimensions are aligned are broadcastable) - std::vector preprocessed_operands; - std::vector size_of_dims(num_total_idxes, -1); // keep track of sizes for each index, -1 means we have not seen a size yet - for (int64_t op = 0; op < (int64_t) tensors.size(); op++) { - auto preprocessed_op = tensors[op]; - std::vector idx_to_dim(num_total_idxes, -1); // the dimension which the index refers to in the original tensor, -1 means it does not appear - std::vector& current_op_input_idxes = input_op_idxes[op]; - int64_t dim = 0; // there are two dimension indices: dim is after taking diagonals, i is in input - for (size_t i = 0; i < current_op_input_idxes.size(); i++) { - auto idx = current_op_input_idxes[i]; - auto dim_out = idxes_to_preprocessed_dims[idx]; - if (idx_to_dim[dim_out] == -1) { // first appearance - idx_to_dim[dim_out] = dim; - if (size_of_dims[idx] == -1) { // keep track of sizes - size_of_dims[idx] = preprocessed_op.size(dim); + // Here we unsqueeze missing dimensions to make all operands have the same + // number of dimensions. We take diagonals for repeated labels within the + // same operand. Finally we permute the operands to align dimensions as + // per the perm_out_index we computed above. 
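To make the alignment step concrete, a sketch for "ij,jk->ik" with illustrative shapes: the output labels i and k take permuted positions 0 and 1 and the contracted label j is appended at position 2, so operand 0 (shape {2, 3}) becomes a {2, 1, 3} view and operand 1 (shape {3, 4}) becomes a {1, 4, 3} view; sumproduct_pair then multiplies them and sums over the trailing j dimension.

#include <ATen/ATen.h>

int main() {
  auto a = at::randn({2, 3});
  auto b = at::randn({3, 4});
  auto c = at::einsum("ij,jk->ik", {a, b});
  TORCH_CHECK(c.allclose(a.mm(b)));  // matches a plain matrix multiply
  return 0;
}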
+ std::vector permuted_operands; + for (std::size_t i = 0; i < operands.size(); ++i) { + std::vector perm_shape(perm_index, -1); + std::vector label_dim(total_labels, -1); + std::vector labels = op_labels[i]; + Tensor operand = operands[i]; + std::size_t j = 0; + + for (int label : labels) { + if (label == ELLIPSIS) { + // Add missing dimensions under ellipsis + int64_t num_dim_diff = + ell_num_dim - (operand.dim() - labels.size() + 1); + for (int64_t k = 0; k < num_dim_diff; ++k) { + operand = operand.unsqueeze(j); } - else { - TORCH_CHECK(size_of_dims[idx] == preprocessed_op.size(dim), "size of dimension does not match previous size, operand ", op, ", dim ", i); + for (int64_t k = 0; k < ell_num_dim; ++k) { + perm_shape[ell_index + k] = j++; } - dim++; - } else { // duplicate dimension in tensor --> take diagonal of idx_to_dim[dim_out] and dim and put the diagonal dimension to idx_to_dim[dim_out] - TORCH_CHECK(size_of_dims[idx] == preprocessed_op.size(dim), "size of dimension does not match previous size, operand ", op, ", dim ", i); - preprocessed_op = preprocessed_op.diagonal(0, idx_to_dim[dim_out], dim); - // diagonal moves the diagonal dimension to the back - // now we permute the last dim back to idx_to_dim[dim_out] - std::vector perm(preprocessed_op.dim(), 0); - for (int64_t d = 0; d < preprocessed_op.dim(); d++) { - if (d == idx_to_dim[dim_out]) { - perm[d] = preprocessed_op.dim() - 1; - } else { - perm[d] = d - (d > idx_to_dim[dim_out]); - } - } - preprocessed_op = preprocessed_op.permute(perm); + } else if (label_dim[label] != -1) { + // Repeated label, take diagonal + int64_t dim = label_dim[label]; + TORCH_CHECK( + operand.size(j) == operand.size(dim), + "einsum() subscript ", + char(label + 'a'), + " is repeated for operand ", + i, + " but the sizes don't match, ", + operand.size(j), + " != ", + operand.size(dim)); + operand = operand.diagonal(0, j, dim).movedim(-1, dim); + } else { + // Lookup output index for label + label_dim[label] = j; + perm_shape[label_perm_index[label]] = j++; } } - // now we permute the dimensions in the right order - std::vector permutation; // permutation for this tensor - for (auto &d : idx_to_dim) { - if (d > -1) { - permutation.push_back(d); + + // Add dimensions for missing labels + for (int64_t& index : perm_shape) { + if (index == -1) { + operand = operand.unsqueeze(-1); + index = j++; } } - preprocessed_op = preprocessed_op.permute(permutation); - // finally, we insert dimensions for idxes not in the operand - for (size_t dim = 0; dim < idx_to_dim.size(); dim++) { - if (idx_to_dim[dim] == -1) { - preprocessed_op = preprocessed_op.unsqueeze(dim); + + permuted_operands.push_back(operand.permute(perm_shape)); + } + + // Check if operands broadcast and keep track of last operand with + // dimension size != 1 for optimizing reductions + std::vector dim_last_op(perm_index, 0); + bool has_zero_size_dim = false; + for (int dim = 0; dim < perm_index; ++dim) { + int64_t broadcast_size = permuted_operands[0].size(dim); + for (std::size_t i = 1; i < permuted_operands.size(); ++i) { + int64_t dim_size = permuted_operands[i].size(dim); + if (broadcast_size != dim_size && broadcast_size != 1 && dim_size != 1) { + std::ostringstream msg; + msg << "einsum() operands do not broadcast with remapped shapes [original->remapped]:"; + for (std::size_t j = 0; j < operands.size(); ++j) { + msg << " " << operands[j].sizes() << "->" + << permuted_operands[j].sizes(); + } + TORCH_CHECK(false, msg.str()); + } + if (dim_size != 1) { + broadcast_size = dim_size; + 
dim_last_op[dim] = i; } } + has_zero_size_dim |= broadcast_size == 0; + } - preprocessed_operands.push_back(std::move(preprocessed_op)); + // Compute result + Tensor result = permuted_operands[0]; + + // Fast path for when an operand has zero sized dim + if (has_zero_size_dim) { + std::vector out_shape(out_size); + for (int i = 0; i < out_size; ++i) { + out_shape[i] = permuted_operands[dim_last_op[i]].size(i); + } + return at::zeros(out_shape, result.options()); } - // now we reduce the indices from left to right - // numpy allows to optimize the path using various - // algorithms (see eigen_path in numpy docs) - // we start with the leftmost operator and reduce indices that - // appear only there - Tensor result = std::move(preprocessed_operands[0]); - for (int64_t idx = 0; idx < num_total_idxes; idx++) { - if ((last_idx_occurrence[idx] == 0) - && (idxes_to_preprocessed_dims[idx]>=num_output_dims)) { - result = result.sum(idxes_to_preprocessed_dims[idx], true); + // Sum out or squeeze dimensions that are size 1 for all later operands + int dim = out_size; + for (int i = dim; i < perm_index; ++i, ++dim) { + if (dim_last_op[i] == 0) { + if (result.size(dim) == 1) { + result = result.squeeze(dim--); + } else { + result = result.sum(dim--); + } } } - // now we process each tensor using sumproduct_pair - for (int64_t i = 1; i < (int64_t) preprocessed_operands.size(); i++) { + for (std::size_t i = 1; i < permuted_operands.size(); ++i) { + Tensor operand = permuted_operands[i]; std::vector sum_dims; - for (int64_t idx = 0; idx < num_total_idxes; idx++) { - if ((last_idx_occurrence[idx] == i) - && (idxes_to_preprocessed_dims[idx]>=num_output_dims)) { - sum_dims.push_back(idxes_to_preprocessed_dims[idx]); + + // Sum out or squeeze dimensions that are size 1 for all later operands + dim = out_size; + for (int j = dim; j < perm_index; ++j, ++dim) { + if (dim_last_op[j] < i) { + operand = operand.squeeze(dim); + --dim; + } else if (dim_last_op[j] == i) { + if (result.size(dim) == 1) { + operand = operand.sum(dim); + result = result.squeeze(dim); + --dim; + } else { + sum_dims.push_back(dim); + } } } - result = at::native::sumproduct_pair(result, std::move(preprocessed_operands[i]), sum_dims, true); - } - // finally, we squeeze out all non-result dimensions - auto sizes = result.sizes().vec(); - for (int64_t dim = num_total_idxes-1; dim >= num_output_dims; dim--) { - sizes.erase(sizes.begin() + dim); + + // Multiply tensors and sum out dimensions in sum_dims + if (sum_dims.empty()) { + result = result.mul(operand); + } else if (sum_dims.size() == result.sizes().size()) { + result = result.flatten().dot(operand.flatten()); + } else { + result = sumproduct_pair(result, operand, sum_dims, false); + } } - result = result.view(sizes); return result; } diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index 0b3f9e518b6e..64aaea298093 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -730,8 +730,6 @@ Tensor all(const Tensor& self) { "all only supports CPU AND CUDA device type, got: ", self.device().type()); TORCH_CHECK(self.layout() == Layout::Strided, "all only supports strided layout, got: ", self.layout()); - TORCH_CHECK(self.scalar_type() == at::ScalarType::Byte || self.scalar_type() == at::ScalarType::Bool, - "all only supports torch.uint8 and torch.bool dtypes"); Tensor result = at::empty({0}, self.options()); auto iter = make_reduction( @@ -749,8 +747,7 @@ Tensor &all_out(Tensor &result, const Tensor &self, int64_t dim, bool 
keepdim) { "all only supports CPU AND CUDA device type, got: ", self.device().type()); TORCH_CHECK(self.layout() == Layout::Strided, "all only supports strided layout, got: ", self.layout()); - TORCH_CHECK(self.scalar_type() == at::ScalarType::Byte || self.scalar_type() == at::ScalarType::Bool, - "all only supports torch.uint8 and torch.bool dtypes"); + dim = maybe_wrap_dim(dim, self.dim()); if (_dimreduce_return_trivial(result, self, 1, dim, keepdim)) { return result; @@ -776,8 +773,6 @@ Tensor any(const Tensor& self) { "any only supports CPU AND CUDA device type, got: ", self.device().type()); TORCH_CHECK(self.layout() == Layout::Strided || self.layout() == Layout::Sparse, "any only supports strided AND sparse layout, got: ", self.layout()); - TORCH_CHECK(self.scalar_type() == at::ScalarType::Byte || self.scalar_type() == at::ScalarType::Bool, - "all only supports torch.uint8 and torch.bool dtypes"); Tensor result = at::empty({0}, self.options()); auto iter = make_reduction( @@ -795,8 +790,7 @@ Tensor &any_out(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) { "any only supports CPU AND CUDA device type, got: ", self.device().type()); TORCH_CHECK(self.layout() == Layout::Strided, "any only supports strided layout, got: ", self.layout()); - TORCH_CHECK(self.scalar_type() == at::ScalarType::Byte || self.scalar_type() == at::ScalarType::Bool, - "all only supports torch.uint8 and torch.bool dtypes"); + dim = maybe_wrap_dim(dim, self.dim()); if (_dimreduce_return_trivial(result, self, 0, dim, keepdim)) { return result; diff --git a/aten/src/ATen/native/ReduceOpsUtils.h b/aten/src/ATen/native/ReduceOpsUtils.h index 205b08d86423..429b6f49a7bd 100644 --- a/aten/src/ATen/native/ReduceOpsUtils.h +++ b/aten/src/ATen/native/ReduceOpsUtils.h @@ -156,17 +156,15 @@ static void allocate_reduction_result( } static Tensor review_reduce_result(const Tensor& result, int ndim, DimMask mask, bool keepdim) { + if (keepdim) { + return result; + } auto shape = DimVector(result.sizes()); auto stride = DimVector(result.strides()); for (int dim = 0; dim < ndim; dim++) { if (mask[dim]) { - if (!keepdim) { - shape.insert(shape.begin() + dim, 1); - stride.insert(stride.begin() + dim, 0); - } else { - TORCH_INTERNAL_ASSERT(shape[dim] == 1); - stride[dim] = 0; - } + shape.insert(shape.begin() + dim, 1); + stride.insert(stride.begin() + dim, 0); } } return result.as_strided(shape, stride); diff --git a/aten/src/ATen/native/SharedReduceOps.h b/aten/src/ATen/native/SharedReduceOps.h index ae80a9e41be9..437a39bf2b92 100644 --- a/aten/src/ATen/native/SharedReduceOps.h +++ b/aten/src/ATen/native/SharedReduceOps.h @@ -340,6 +340,56 @@ struct NanSumOps { #endif }; +template +struct AndOps { + inline C10_DEVICE acc_t reduce(acc_t a, acc_t b, int64_t /*idx*/) const { + return static_cast(a) && static_cast(b); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return static_cast(a) && static_cast(b); + } + + inline C10_DEVICE acc_t project(acc_t a) const { + return a; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t data, int offset) const { + return WARP_SHFL_DOWN(data, offset); + } +#endif +}; + +template +struct OrOps { + inline C10_DEVICE acc_t reduce(acc_t a, acc_t b, int64_t /*idx*/) const { + return static_cast(a) || static_cast(b); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return static_cast(a) || static_cast(b); + } + + 
inline C10_DEVICE acc_t project(acc_t a) const { + return a; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t data, int offset) const { + return WARP_SHFL_DOWN(data, offset); + } +#endif +}; + namespace detail { template diff --git a/aten/src/ATen/native/TensorIterator.cpp b/aten/src/ATen/native/TensorIterator.cpp index 81a7ae3b9a9d..cb37b2055eda 100644 --- a/aten/src/ATen/native/TensorIterator.cpp +++ b/aten/src/ATen/native/TensorIterator.cpp @@ -542,15 +542,6 @@ void TensorIterator::coalesce_dimensions() { auto can_coalesce = [&](int dim0, int dim1) { auto shape0 = shape_[dim0]; auto shape1 = shape_[dim1]; - if (is_reduction_) { - // The dimension being reduced should not be coalesced - for (int i = 0; i < noutputs(); i++) { - auto& stride = operands_[i].stride_bytes; - if (stride[dim0] == 0 || stride[dim1] == 0) { - return false; - } - } - } if (shape0 == 1 || shape1 == 1) { return true; } @@ -811,7 +802,7 @@ void TensorIterator::narrow(int dim, int64_t start, int64_t size) { for (auto& op : operands_) { op.data = ((char*)op.data) + op.stride_bytes[dim] * start; } - if (size == 1) { + if (size == 1 && !is_reduction_) { coalesce_dimensions(); } } @@ -1406,24 +1397,4 @@ std::array DimCounter::max_2d_step() const { return {step0, step1}; } -std::ostream& operator<<(std::ostream& os, const TensorIterator& iter) { - os << "TensorIterator @ " << &iter << " {" << std::endl; - os << " ntensors() = " << iter.ntensors() << std::endl; - os << " noutputs() = " << iter.noutputs() << std::endl; - os << " shape() = " << iter.shape() << std::endl; - os << " strides(*) = {" << std::endl; - for (int i = 0; i < iter.ntensors(); i++) { - os << " (" << i << ") = " << iter.strides(i) << std::endl; - } - os << " }" << std::endl; - os << " dtype(*) = {" << std::endl; - for (int i = 0; i < iter.ntensors(); i++) { - os << " (" << i << ") = " << iter.dtype(i) << std::endl; - } - os << " }" << std::endl; - os << " is_reduction_ = " << iter.is_reduction_ << std::endl; - os << "}"; - return os; -} - } // namespace at diff --git a/aten/src/ATen/native/TensorIterator.h b/aten/src/ATen/native/TensorIterator.h index 3a9612e158ae..febf21a290dd 100644 --- a/aten/src/ATen/native/TensorIterator.h +++ b/aten/src/ATen/native/TensorIterator.h @@ -1,6 +1,5 @@ #pragma once -#include #include #include #include @@ -297,8 +296,6 @@ struct CAFFE2_API TensorIterator { return true; } - friend CAFFE2_API std::ostream& operator<<(std::ostream& os, const TensorIterator& iter); - protected: void build(TensorIteratorConfig&); @@ -536,6 +533,4 @@ struct CAFFE2_API SplitUntil32Bit { const TensorIterator& iter; }; -CAFFE2_API std::ostream& operator<<(std::ostream& os, const TensorIterator& iter); - } // namespace at diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp index d56582467894..c3207604f34a 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp @@ -232,41 +232,55 @@ static void norm_kernel_tensor_iterator_impl( } static void and_kernel_impl(TensorIterator& iter) { - binary_kernel_reduce_vec( - iter, - [=](uint8_t a, uint8_t b) -> uint8_t { return a && b; }, - [=](Vec256 a, Vec256 b) { - // Adding the implementation here instead of in vec256_base to avoid - // return value inconsistency. 
Other comparison operators in vec256_base - // return -1/0 (all bit 1 / all bit 0) as true/false to follow the AVX2 - // convention. This would be convenient when combined with other - // vectorized operations. For example, one can use the logical operation - // results as a mask for a bit operation to retrieve/reset multiple - // elements in a vector. - // - // In this method, users would expect, e.g., all(), to return 1/0 as - // true/false. - Vec256 c = Vec256(); - for (int i = 0; i != Vec256::size(); i++) { - c[i] = a[i] && b[i]; - } - return c; - }, - /*ident=*/true); + if (c10::isIntegralType(iter.dtype(), /*includeBool=*/true)) { + binary_kernel_reduce_vec( + iter, + [=](uint8_t a, uint8_t b) -> uint8_t { return (a && b) ? 1 : 0; }, + [=](Vec256 a, Vec256 b) { + // Adding the implementation here instead of in vec256_base to avoid + // return value inconsistency. Other comparison operators in + // vec256_base return -1/0 (all bit 1 / all bit 0) as true/false to + // follow the AVX2 convention. This would be convenient when combined + // with other vectorized operations. For example, one can use the + // logical operation results as a mask for a bit operation to + // retrieve/reset multiple elements in a vector. + // + // In this method, users would expect, e.g., all(), to return 1/0 as + // true/false. + Vec256 c = Vec256(); + for (int i = 0; i != Vec256::size(); i++) { + c[i] = (a[i] && b[i]) ? 1 : 0; + } + return c; + }, + /*ident=*/true); + } else { + AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.dtype(), "and_kernel", [&]() { + binary_kernel_reduce( + iter, AndOps(), static_cast(true)); + }); + } } static void or_kernel_impl(TensorIterator& iter) { - binary_kernel_reduce_vec( - iter, - [=](uint8_t a, uint8_t b) -> uint8_t { return a || b; }, - [=](Vec256 a, Vec256 b) { - Vec256 c = Vec256(); - for (int i = 0; i != Vec256::size(); i++) { - c[i] = a[i] || b[i]; - } - return c; - }, - /*ident=*/false); + if (c10::isIntegralType(iter.dtype(), /*includeBool=*/true)) { + binary_kernel_reduce_vec( + iter, + [=](uint8_t a, uint8_t b) -> uint8_t { return (a || b) ? 1 : 0; }, + [=](Vec256 a, Vec256 b) { + Vec256 c = Vec256(); + for (int i = 0; i != Vec256::size(); i++) { + c[i] = (a[i] || b[i]) ? 
1 : 0; + } + return c; + }, + /*ident=*/false); + } else { + AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.dtype(), "or_kernel", [&]() { + binary_kernel_reduce( + iter, OrOps(), static_cast(false)); + }); + } } template diff --git a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu index d6a8709ca967..13f278b0d900 100644 --- a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu +++ b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu @@ -4,86 +4,6 @@ namespace at { namespace native { -template class Op> -std::vector foreach_unary_op_complex(TensorList tensors) { - std::vector> tensor_lists; - std::vector vec_res; - vec_res.reserve(tensors.size()); - for (const auto& t: tensors) { - vec_res.emplace_back(at::native::empty_like(t)); - } - - tensor_lists.emplace_back(tensors.vec()); - tensor_lists.emplace_back(std::move(vec_res)); - - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, tensors[0].scalar_type(), "foreach_unary_op_cuda", [&]() { - using opmath_t = get_opmath_t::opmath_t; - multi_tensor_apply<2>(tensor_lists, - UnaryOpFunctor(), - Op()); - }); - return tensor_lists[1]; -} - -template class Op> -void foreach_unary_op_complex_(TensorList tensors) { - std::vector> tensor_lists; - tensor_lists.emplace_back(tensors.vec()); - - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, tensors[0].scalar_type(), "foreach_unary_op_cuda_", [&]() { - using opmath_t = get_opmath_t::opmath_t; - multi_tensor_apply<1>(tensor_lists, - UnaryOpFunctor(), - Op()); - }); -} - -template class Op> -std::vector foreach_unary_op_complex_bfloat16(TensorList tensors) { - std::vector> tensor_lists; - std::vector vec_res; - vec_res.reserve(tensors.size()); - for (const auto& t: tensors) { - vec_res.emplace_back(at::native::empty_like(t)); - } - - tensor_lists.emplace_back(tensors.vec()); - tensor_lists.emplace_back(std::move(vec_res)); - - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, tensors[0].scalar_type(), "foreach_unary_op_cuda", [&]() { - using opmath_t = get_opmath_t::opmath_t; - multi_tensor_apply<2>(tensor_lists, - UnaryOpFunctor(), - Op()); - }); - return tensor_lists[1]; -} - -template class Op> -void foreach_unary_op_complex_bfloat16_(TensorList tensors) { - std::vector> tensor_lists; - tensor_lists.emplace_back(tensors.vec()); - - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, tensors[0].scalar_type(), "foreach_unary_op_cuda_", [&]() { - using opmath_t = get_opmath_t::opmath_t; - multi_tensor_apply<1>(tensor_lists, - UnaryOpFunctor(), - Op()); - }); -} - template class Op> std::vector foreach_unary_op(TensorList tensors) { std::vector> tensor_lists; @@ -96,7 +16,7 @@ std::vector foreach_unary_op(TensorList tensors) { tensor_lists.emplace_back(tensors.vec()); tensor_lists.emplace_back(std::move(vec_res)); - AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::Half, tensors[0].scalar_type(), "foreach_unary_op_cuda", [&]() { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, tensors[0].scalar_type(), "foreach_unary_op_cuda", [&]() { using opmath_t = get_opmath_t::opmath_t; multi_tensor_apply<2>(tensor_lists, UnaryOpFunctor> tensor_lists; tensor_lists.emplace_back(tensors.vec()); - AT_DISPATCH_FLOATING_TYPES_AND_HALF(tensors[0].scalar_type(), "foreach_unary_op_cuda_", [&]() { - using opmath_t = get_opmath_t::opmath_t; - multi_tensor_apply<1>(tensor_lists, - UnaryOpFunctor(), - Op()); - }); -} - -template class Op> -void foreach_op_unary_(TensorList tensors) { - std::vector> 
tensor_lists; - tensor_lists.emplace_back(tensors.vec()); - - AT_DISPATCH_FLOATING_TYPES_AND_HALF(tensors[0].scalar_type(), "foreach_unary_op_cuda_", [&]() { - using opmath_t = get_opmath_t::opmath_t; - multi_tensor_apply<1>(tensor_lists, - UnaryOpFunctor(), - Op()); - }); -} - -template class Op> -std::vector foreach_unary_op_bfloat16(TensorList tensors) { - std::vector> tensor_lists; - std::vector vec_res; - vec_res.reserve(tensors.size()); - for (const auto& t: tensors) { - vec_res.emplace_back(at::native::empty_like(t)); - } - - tensor_lists.emplace_back(tensors.vec()); - tensor_lists.emplace_back(std::move(vec_res)); - - AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, tensors[0].scalar_type(), "foreach_unary_op_cuda", [&]() { - using opmath_t = get_opmath_t::opmath_t; - multi_tensor_apply<2>(tensor_lists, - UnaryOpFunctor(), - Op()); - }); - return tensor_lists[1]; -} - -template class Op> -void foreach_unary_op_bfloat16_(TensorList tensors) { - std::vector> tensor_lists; - tensor_lists.emplace_back(tensors.vec()); - - AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, tensors[0].scalar_type(), "foreach_unary_op_cuda_", [&]() { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, tensors[0].scalar_type(), "foreach_unary_op_cuda_", [&]() { using opmath_t = get_opmath_t::opmath_t; multi_tensor_apply<1>(tensor_lists, UnaryOpFunctor \ -struct NAME1 { \ - __device__ T operator()(T t) const { return std::NAME(t); } \ -}; \ - \ -std::vector foreach_tensor_##NAME##_cuda(TensorList tensors) { \ - check_foreach_api_restrictions(tensors); \ - if (!can_use_fast_route(tensors)) { \ - return at::native::foreach_tensor_##NAME##_slow(tensors); \ - } \ - \ - return foreach_unary_op_complex(tensors); \ -} \ - \ -void foreach_tensor_##NAME##_cuda_(TensorList tensors) { \ - check_foreach_api_restrictions(tensors); \ - if (!can_use_fast_route(tensors)) { \ - return at::native::foreach_tensor_##NAME##_slow_(tensors); \ - } \ - \ - foreach_unary_op_complex_(tensors); \ -} - -#define FOREACH_UNARY_OP_COMPLEX_BFLOAT16(NAME, NAME1) \ -template \ -struct NAME1 { \ - __device__ T operator()(T t) const { return std::NAME(t); } \ -}; \ - \ -std::vector foreach_tensor_##NAME##_cuda(TensorList tensors) { \ - check_foreach_api_restrictions(tensors); \ - if (!can_use_fast_route(tensors)) { \ - return at::native::foreach_tensor_##NAME##_slow(tensors); \ - } \ - \ - return foreach_unary_op_complex_bfloat16(tensors); \ -} \ - \ -void foreach_tensor_##NAME##_cuda_(TensorList tensors) { \ - check_foreach_api_restrictions(tensors); \ - if (!can_use_fast_route(tensors)) { \ - return at::native::foreach_tensor_##NAME##_slow_(tensors); \ - } \ - \ - foreach_unary_op_complex_bfloat16_(tensors); \ -} - #define FOREACH_UNARY_OP(NAME, NAME1) \ template \ struct NAME1 { \ @@ -252,101 +68,7 @@ void foreach_tensor_##NAME##_cuda_(TensorList tensors) { \ foreach_unary_op_(tensors); \ } -#define FOREACH_UNARY_OP_BFLOAT16(NAME, NAME1) \ -template \ -struct NAME1 { \ - __device__ T operator()(T t) const { return std::NAME(t); } \ -}; \ - \ -std::vector foreach_tensor_##NAME##_cuda(TensorList tensors) { \ - check_foreach_api_restrictions(tensors); \ - \ - if (!can_use_fast_route(tensors)) { \ - return at::native::foreach_tensor_##NAME##_slow(tensors); \ - } \ - \ - return foreach_unary_op_bfloat16(tensors); \ -} \ - \ -void foreach_tensor_##NAME##_cuda_(TensorList tensors) { \ - check_foreach_api_restrictions(tensors); \ - \ - if (!can_use_fast_route(tensors)) { \ - return 
at::native::foreach_tensor_##NAME##_slow_(tensors); \ - } \ - \ - foreach_unary_op_bfloat16_(tensors); \ -} - -FOREACH_UNARY_OP(ceil, Ceil); -FOREACH_UNARY_OP(erfc, Erfc); -FOREACH_UNARY_OP(expm1, Expm1); -FOREACH_UNARY_OP(floor, Floor); -FOREACH_UNARY_OP(lgamma, Lgamma); - -FOREACH_UNARY_OP_BFLOAT16(log1p, Log1p); -FOREACH_UNARY_OP_BFLOAT16(erf, Erf); - -FOREACH_UNARY_OP_COMPLEX(acos, Acos); -FOREACH_UNARY_OP_COMPLEX(asin, Asin); -FOREACH_UNARY_OP_COMPLEX(atan, Atan); -FOREACH_UNARY_OP_COMPLEX(cosh, Cosh); -FOREACH_UNARY_OP_COMPLEX(tan, Tan); -FOREACH_UNARY_OP_COMPLEX(sin, Sin); -FOREACH_UNARY_OP_COMPLEX(sinh, Sinh); - -FOREACH_UNARY_OP_COMPLEX_BFLOAT16(abs, Abs); -FOREACH_UNARY_OP_COMPLEX_BFLOAT16(exp, Exp); -FOREACH_UNARY_OP_COMPLEX_BFLOAT16(sqrt, Sqrt); -FOREACH_UNARY_OP_COMPLEX_BFLOAT16(cos, Cos); -FOREACH_UNARY_OP_COMPLEX_BFLOAT16(tanh, Tanh); -FOREACH_UNARY_OP_COMPLEX_BFLOAT16(log, Log); -FOREACH_UNARY_OP_COMPLEX_BFLOAT16(log10, Log10); -FOREACH_UNARY_OP_COMPLEX_BFLOAT16(log2, Log2); - -std::vector foreach_tensor_neg_cuda(TensorList tensors) { - check_foreach_api_restrictions(tensors); - - if (!can_use_fast_route(tensors)) { - return at::native::foreach_tensor_neg_slow(tensors); - } - - return foreach_unary_op_complex_bfloat16(tensors); -} - -void foreach_tensor_neg_cuda_(TensorList tensors) { - check_foreach_api_restrictions(tensors); - - if (!can_use_fast_route(tensors)) { - return at::native::foreach_tensor_neg_slow_(tensors); - } - - foreach_unary_op_complex_bfloat16_(tensors); -} - -template \ -struct Round { \ - __device__ T operator()(T t) const { return std::nearbyint(t); } \ -}; - -std::vector foreach_tensor_round_cuda(TensorList tensors) { - check_foreach_api_restrictions(tensors); - - if (!can_use_fast_route(tensors)) { - return at::native::foreach_tensor_round_slow(tensors); - } - - return foreach_unary_op(tensors); -} - -void foreach_tensor_round_cuda_(TensorList tensors) { - check_foreach_api_restrictions(tensors); - - if (!can_use_fast_route(tensors)) { - return at::native::foreach_tensor_round_slow_(tensors); - } - - foreach_unary_op_(tensors); -} +FOREACH_UNARY_OP(exp, Exp); +FOREACH_UNARY_OP(sqrt, Sqrt); }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/ReduceLogicKernel.cu b/aten/src/ATen/native/cuda/ReduceLogicKernel.cu index ca2db43637dd..a29a926ef257 100644 --- a/aten/src/ATen/native/cuda/ReduceLogicKernel.cu +++ b/aten/src/ATen/native/cuda/ReduceLogicKernel.cu @@ -8,17 +8,25 @@ namespace at { namespace native { void and_kernel_cuda(TensorIterator& iter) { - gpu_reduce_kernel( - iter, func_wrapper ([]GPU_LAMBDA(uint8_t a, uint8_t b) -> uint8_t { - return a && b; - }), true); + AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.dtype(), "and_kernel", [&]() { + gpu_reduce_kernel( + iter, + func_wrapper([] GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { + return static_cast(static_cast(a) && static_cast(b)); + }), + static_cast(true)); + }); } void or_kernel_cuda(TensorIterator& iter) { - gpu_reduce_kernel( - iter, func_wrapper ([]GPU_LAMBDA(uint8_t a, uint8_t b) -> uint8_t { - return a || b; - }), false); + AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.dtype(), "or_kernel", [&]() { + gpu_reduce_kernel( + iter, + func_wrapper([] GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { + return static_cast(static_cast(a) || static_cast(b)); + }), + static_cast(false)); + }); } REGISTER_DISPATCH(and_stub, &and_kernel_cuda); diff --git a/aten/src/ATen/native/cuda/Sorting.cu b/aten/src/ATen/native/cuda/Sorting.cu index c6688b286914..889ccf606152 100644 --- 
a/aten/src/ATen/native/cuda/Sorting.cu +++ b/aten/src/ATen/native/cuda/Sorting.cu @@ -250,7 +250,7 @@ void kthvalue_cuda_template( int64_t dim_, bool keepdim) { int64_t dim = maybe_wrap_dim(dim_, self.dim()); - int64_t slicesize = self.size(dim); + int64_t slicesize = self.dim() == 0 ? 1 : self.size(dim); // FIXME: This seems bogus, I only do this because it was the old behaviour. // The reductions are fine, as long as the axis being reduced along // isn't of 0 elements (and the output has elements). diff --git a/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm b/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm index e56845fd1f9e..bbcbfe10fd01 100644 --- a/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm +++ b/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm @@ -7,7 +7,6 @@ #import #include -#include #import #include diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 0324ed352c95..2dcd3d234e46 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -7117,358 +7117,6 @@ CPU: foreach_tensor_sqrt_slow_ CUDA: foreach_tensor_sqrt_cuda_ -- func: _foreach_abs(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_abs_slow - CUDA: foreach_tensor_abs_cuda - -- func: _foreach_abs_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_abs_slow_ - CUDA: foreach_tensor_abs_cuda_ - -- func: _foreach_acos(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_acos_slow - CUDA: foreach_tensor_acos_cuda - -- func: _foreach_acos_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_acos_slow_ - CUDA: foreach_tensor_acos_cuda_ - -- func: _foreach_asin(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_asin_slow - CUDA: foreach_tensor_asin_cuda - -- func: _foreach_asin_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_asin_slow_ - CUDA: foreach_tensor_asin_cuda_ - -- func: _foreach_atan(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_atan_slow - CUDA: foreach_tensor_atan_cuda - -- func: _foreach_atan_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_atan_slow_ - CUDA: foreach_tensor_atan_cuda_ - -- func: _foreach_ceil(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_ceil_slow - CUDA: foreach_tensor_ceil_cuda - -- func: _foreach_ceil_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_ceil_slow_ - CUDA: foreach_tensor_ceil_cuda_ - -- func: _foreach_cos(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_cos_slow - CUDA: foreach_tensor_cos_cuda - -- func: _foreach_cos_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_cos_slow_ - CUDA: 
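Editor's note: the Sorting.cu change at the top of this hunk guards kthvalue against zero-dimensional inputs: a scalar tensor has dim() == 0, so size(dim) is not queryable, yet it still holds exactly one element to reduce over. The same slice-size computation over a plain shape vector (illustrative only):

#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Number of elements along `dim`, treating a 0-d (scalar) shape as one element.
int64_t slice_size(const std::vector<int64_t>& sizes, int64_t dim) {
  if (sizes.empty()) {
    return 1;  // scalar: dim() == 0 but there is exactly one value
  }
  assert(dim >= 0 && dim < static_cast<int64_t>(sizes.size()));
  return sizes[dim];
}

int main() {
  std::cout << slice_size({}, 0) << " "        // 1
            << slice_size({4, 5}, 1) << "\n";  // 5
}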
foreach_tensor_cos_cuda_ - -- func: _foreach_cosh(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_cosh_slow - CUDA: foreach_tensor_cosh_cuda - -- func: _foreach_cosh_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_cosh_slow_ - CUDA: foreach_tensor_cosh_cuda_ - -- func: _foreach_erf(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_erf_slow - CUDA: foreach_tensor_erf_cuda - -- func: _foreach_erf_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_erf_slow_ - CUDA: foreach_tensor_erf_cuda_ - -- func: _foreach_erfc(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_erfc_slow - CUDA: foreach_tensor_erfc_cuda - -- func: _foreach_erfc_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_erfc_slow_ - CUDA: foreach_tensor_erfc_cuda_ - -- func: _foreach_expm1(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_expm1_slow - CUDA: foreach_tensor_expm1_cuda - -- func: _foreach_expm1_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_expm1_slow_ - CUDA: foreach_tensor_expm1_cuda_ - -- func: _foreach_floor(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_floor_slow - CUDA: foreach_tensor_floor_cuda - -- func: _foreach_floor_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_floor_slow_ - CUDA: foreach_tensor_floor_cuda_ - -- func: _foreach_log(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_log_slow - CUDA: foreach_tensor_log_cuda - -- func: _foreach_log_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_log_slow_ - CUDA: foreach_tensor_log_cuda_ - -- func: _foreach_log10(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_log10_slow - CUDA: foreach_tensor_log10_cuda - -- func: _foreach_log10_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_log10_slow_ - CUDA: foreach_tensor_log10_cuda_ - -- func: _foreach_log1p(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_log1p_slow - CUDA: foreach_tensor_log1p_cuda - -- func: _foreach_log1p_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_log1p_slow_ - CUDA: foreach_tensor_log1p_cuda_ - -- func: _foreach_log2(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_log2_slow - CUDA: foreach_tensor_log2_cuda - -- func: _foreach_log2_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False 
- variants: function - dispatch: - CPU: foreach_tensor_log2_slow_ - CUDA: foreach_tensor_log2_cuda_ - -- func: _foreach_neg(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_neg_slow - CUDA: foreach_tensor_neg_cuda - -- func: _foreach_neg_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_neg_slow_ - CUDA: foreach_tensor_neg_cuda_ - -- func: _foreach_tan(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_tan_slow - CUDA: foreach_tensor_tan_cuda - -- func: _foreach_tan_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_tan_slow_ - CUDA: foreach_tensor_tan_cuda_ - -- func: _foreach_tanh(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_tanh_slow - CUDA: foreach_tensor_tanh_cuda - -- func: _foreach_tanh_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_tanh_slow_ - CUDA: foreach_tensor_tanh_cuda_ - -- func: _foreach_sin(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_sin_slow - CUDA: foreach_tensor_sin_cuda - -- func: _foreach_sin_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_sin_slow_ - CUDA: foreach_tensor_sin_cuda_ - -- func: _foreach_sinh(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_sinh_slow - CUDA: foreach_tensor_sinh_cuda - -- func: _foreach_sinh_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_sinh_slow_ - CUDA: foreach_tensor_sinh_cuda_ - -- func: _foreach_round(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_round_slow - CUDA: foreach_tensor_round_cuda - -- func: _foreach_round_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_round_slow_ - CUDA: foreach_tensor_round_cuda_ - -- func: _foreach_lgamma(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_lgamma_slow - CUDA: foreach_tensor_lgamma_cuda - -- func: _foreach_lgamma_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_lgamma_slow_ - CUDA: foreach_tensor_lgamma_cuda_ - - func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () use_c10_dispatcher: full device_guard: False diff --git a/aten/src/ATen/native/vulkan/VulkanAten.cpp b/aten/src/ATen/native/vulkan/VulkanAten.cpp index 6a781c3ab69b..18df3ae818f3 100644 --- a/aten/src/ATen/native/vulkan/VulkanAten.cpp +++ b/aten/src/ATen/native/vulkan/VulkanAten.cpp @@ -58,7 +58,7 @@ Tensor empty( const TensorOptions& options, const optional memory_format) { TORCH_CHECK( - !options.has_pinned_memory(), + !options.pinned_memory(), "'pin_memory' argument is incompatible with Vulkan tensor"); TORCH_CHECK( 
!options.has_memory_format() && !memory_format, @@ -519,6 +519,7 @@ Tensor mean( const IntArrayRef dim, const bool keepdim, const optional dtype) { + TORCH_INTERNAL_ASSERT(!keepdim, "keepdim not implemented for Vulkan mean"); TORCH_INTERNAL_ASSERT(self.is_vulkan(), "mean expects Vulkan tensor input"); // Mean is implemented only for HW dimensions of 4-d tensor @@ -541,7 +542,7 @@ Tensor mean( TORCH_LIBRARY_IMPL(aten, Vulkan, m) { m.impl("slice.Tensor", TORCH_FN(at::native::vulkan::aten::slice)); - m.impl("reshape", TORCH_FN(at::native::vulkan::aten::reshape)); + m.impl("view", TORCH_FN(at::native::vulkan::aten::reshape)); m.impl("select.int", TORCH_FN(at::native::vulkan::aten::select)); m.impl("transpose.int", TORCH_FN(at::native::vulkan::aten::transpose)); m.impl_UNBOXED("transpose_", at::native::vulkan::aten::transpose_); @@ -567,8 +568,8 @@ TORCH_LIBRARY_IMPL(aten, Vulkan, m) { m.impl("add.Scalar", TORCH_FN(at::native::vulkan::aten::add_scalar)); m.impl_UNBOXED( "convolution_overrideable", at::native::vulkan::aten::convolution); - m.impl_UNBOXED("hardtanh_", at::native::vulkan::aten::hardtanh_); - m.impl_UNBOXED("relu_", at::native::vulkan::aten::relu_); + m.impl("hardtanh_", at::native::vulkan::aten::hardtanh_); + m.impl("relu_", at::native::vulkan::aten::relu_); m.impl_UNBOXED("add_.Tensor", at::native::vulkan::aten::add_); } diff --git a/aten/src/ATen/native/vulkan/VulkanOps.cpp b/aten/src/ATen/native/vulkan/VulkanOps.cpp index f65e6b3336f5..8ad79a0c6f31 100644 --- a/aten/src/ATen/native/vulkan/VulkanOps.cpp +++ b/aten/src/ATen/native/vulkan/VulkanOps.cpp @@ -29,7 +29,6 @@ void upsample_nearest2d( float scaleH, float scaleW) { auto device = context().device(); - auto physicalDevice = context().physicalDevice(); int64_t C = IN * IC; struct ConstBlock { float scaleX; @@ -477,7 +476,6 @@ void add( auto W = os4[3]; auto device = context().device(); - auto physicalDevice = context().physicalDevice(); struct ConstBlock { float alpha; }; @@ -1115,10 +1113,8 @@ void clamp( auto C = sizes[0] * sizes[1]; auto H = sizes[2]; auto W = sizes[3]; - auto C_4 = UP_DIV(C, 4); auto device = context().device(); - auto physicalDevice = context().physicalDevice(); struct ConstBlock { float min; float max; @@ -1170,14 +1166,10 @@ void addmm( const auto m2Sizes = m2.sizes(); TORCH_INTERNAL_ASSERT(m1Sizes.size() == 2); TORCH_INTERNAL_ASSERT(m2Sizes.size() == 2); - const auto m1H = m1Sizes[0]; const auto m1W = m1Sizes[1]; const auto m1C = 1; - const auto m1C_4 = UP_DIV(m1C, 4); const auto m2H = m2Sizes[0]; - const auto m2W = m2Sizes[1]; const auto m2C = 1; - const auto m2C_4 = UP_DIV(m2C, 4); const auto OH = m1Sizes[0]; const auto OW = m2Sizes[1]; @@ -1186,7 +1178,6 @@ void addmm( const auto C = m1C; const auto C_4 = UP_DIV(C, 4); - const auto K = m1W; auto device = context().device(); @@ -1206,15 +1197,14 @@ void addmm( VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, }; } else { descriptorTypes = { VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, }; } @@ -1228,9 +1218,9 @@ void addmm( output.image()->bindStorageImage(descriptorSet, 0); m1.image()->bindShaderRead(descriptorSet, 1); m2.image()->bindShaderRead(descriptorSet, 2); - constBuffer.bind(descriptorSet, 3); if (hasT) { - 
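Editor's note: the Vulkan mean above is restricted to reducing the H and W dimensions of a 4-d NCHW tensor, and now rejects keepdim outright; the reworked shader (renamed to mean2d just below) produces one output value per (n, c) pair. What each invocation computes, as a plain C++ sketch over a flat NCHW buffer, not the Vulkan code itself:

#include <cstddef>
#include <iostream>
#include <vector>

// Mean over the H and W dimensions of a flat NCHW buffer: one value per (n, c).
std::vector<float> mean_hw(const std::vector<float>& input,
                           std::size_t N, std::size_t C, std::size_t H, std::size_t W) {
  std::vector<float> out(N * C, 0.f);
  for (std::size_t n = 0; n < N; ++n)
    for (std::size_t c = 0; c < C; ++c) {
      float sum = 0.f;
      for (std::size_t h = 0; h < H; ++h)
        for (std::size_t w = 0; w < W; ++w)
          sum += input[((n * C + c) * H + h) * W + w];
      out[n * C + c] = sum / static_cast<float>(H * W);
    }
  return out;
}

int main() {
  std::vector<float> x{1, 2, 3, 4, 10, 20, 30, 40};               // N=1, C=2, H=2, W=2
  for (float v : mean_hw(x, 1, 2, 2, 2)) std::cout << v << ' ';   // 2.5 25
  std::cout << '\n';
}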
(*t).image()->bindShaderRead(descriptorSet, 4); + (*t).image()->bindShaderRead(descriptorSet, 3); + constBuffer.bind(descriptorSet, 4); } WorkGroupSize workGroupSize{8, 8, 1}; @@ -1268,17 +1258,13 @@ void mean(VulkanTensor& output, const VulkanTensor& input) { int32_t C = safe_downcast(isizes[1]); int32_t H = safe_downcast(isizes[2]); int32_t W = safe_downcast(isizes[3]); - int32_t C_4 = UP_DIV(N * C, 4); auto device = context().device(); - auto physicalDevice = context().physicalDevice(); struct ConstBlock { int32_t W; int32_t H; - int32_t OW; - int32_t OH; }; - ConstBlock cb{W, H, C, N}; + ConstBlock cb{W, H}; VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); VkDescriptorSetLayout descriptorSetLayout{}; @@ -1301,12 +1287,12 @@ void mean(VulkanTensor& output, const VulkanTensor& input) { WorkGroupSize workGroupSize{1, 1, 1}; auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(mean), descriptorSetLayout, workGroupSize); + GLSL_SPV(mean2d), descriptorSetLayout, workGroupSize); computeUnit.createCommandBuffer(descriptorSet); auto commandBuffer = computeUnit.commandBuffer(); output.image()->addImageMemoryBarrierToGeneral(commandBuffer); input.image()->addImageMemoryBarrierToShaderRead(commandBuffer); - computeUnit.dispatchCommandBuffer(1, 1, C_4, workGroupSize); + computeUnit.dispatchCommandBuffer(C, N, 1, workGroupSize); computeUnit.endCommandBuffer(); computeUnit.submitAndWaitCommandBuffer(); vkDestroyDescriptorPool(device, descriptorPool, nullptr); diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl index 1c6f5d98fc21..c2962844e0bc 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl @@ -17,6 +17,7 @@ layout(set = 0, binding = 4) uniform PRECISION restrict Block ivec2 stride; ivec2 padding; vec2 clamp; + int W; } uBlock; layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; @@ -38,10 +39,37 @@ void main() { const vec4 In = texelFetch(uInput, ivec3(ipos.x, ipos.y, z), 0); const ivec4 kz = block + 4 * z; - sum = fma(In.xxxx, texelFetch(uKernel, ivec3(0, 0, kz.x), 0), sum); - sum = fma(In.yyyy, texelFetch(uKernel, ivec3(0, 0, kz.y), 0), sum); - sum = fma(In.zzzz, texelFetch(uKernel, ivec3(0, 0, kz.z), 0), sum); - sum = fma(In.wwww, texelFetch(uKernel, ivec3(0, 0, kz.w), 0), sum); + const int W = uBlock.W; + + const vec4 val1 = vec4( + texelFetch(uKernel, ivec3((4*kz.x+0)%W, ((4*kz.x+0))/W, 0), 0).x, + texelFetch(uKernel, ivec3((4*kz.x+1)%W, ((4*kz.x+1))/W, 0), 0).x, + texelFetch(uKernel, ivec3((4*kz.x+2)%W, ((4*kz.x+2))/W, 0), 0).x, + texelFetch(uKernel, ivec3((4*kz.x+3)%W, ((4*kz.x+3))/W, 0), 0).x + ); + const vec4 val2 = vec4( + texelFetch(uKernel, ivec3((4*kz.y+0)%W, ((4*kz.y+0))/W, 0), 0).x, + texelFetch(uKernel, ivec3((4*kz.y+1)%W, ((4*kz.y+1))/W, 0), 0).x, + texelFetch(uKernel, ivec3((4*kz.y+2)%W, ((4*kz.y+2))/W, 0), 0).x, + texelFetch(uKernel, ivec3((4*kz.y+3)%W, ((4*kz.y+3))/W, 0), 0).x + ); + const vec4 val3 = vec4( + texelFetch(uKernel, ivec3((4*kz.z+0)%W, ((4*kz.z+0))/W, 0), 0).x, + texelFetch(uKernel, ivec3((4*kz.z+1)%W, ((4*kz.z+1))/W, 0), 0).x, + texelFetch(uKernel, ivec3((4*kz.z+2)%W, ((4*kz.z+2))/W, 0), 0).x, + texelFetch(uKernel, ivec3((4*kz.z+3)%W, ((4*kz.z+3))/W, 0), 0).x + ); + const vec4 val4 = vec4( + texelFetch(uKernel, ivec3((4*kz.w+0)%W, ((4*kz.w+0))/W, 0), 0).x, + texelFetch(uKernel, ivec3((4*kz.w+1)%W, ((4*kz.w+1))/W, 0), 0).x, + texelFetch(uKernel, ivec3((4*kz.w+2)%W, ((4*kz.w+2))/W, 0), 
0).x, + texelFetch(uKernel, ivec3((4*kz.w+3)%W, ((4*kz.w+3))/W, 0), 0).x + ); + + sum = fma(In.xxxx, val1, sum); + sum = fma(In.yyyy, val2, sum); + sum = fma(In.zzzz, val3, sum); + sum = fma(In.wwww, val4, sum); } imageStore( diff --git a/aten/src/ATen/native/vulkan/ops/Clamp.cpp b/aten/src/ATen/native/vulkan/ops/Clamp.cpp index d56bbb2f4f82..6aec84d8b349 100644 --- a/aten/src/ATen/native/vulkan/ops/Clamp.cpp +++ b/aten/src/ATen/native/vulkan/ops/Clamp.cpp @@ -120,11 +120,37 @@ Tensor& clamp_( return self_arg; } +Tensor hardtanh( + const Tensor& self, + const Scalar min, + const Scalar max) { + return ops::clamp(self, min, max); +} + +Tensor& hardtanh_( + Tensor& self, + const Scalar min, + const Scalar max) { + return ops::clamp_(self, min, max); +} + +Tensor relu(const Tensor& self) { + return ops::clamp(self, 0, c10::nullopt); +} + +Tensor& relu_(Tensor& self) { + return ops::clamp_(self, 0, c10::nullopt); +} + #ifdef USE_VULKAN_API TORCH_LIBRARY_IMPL(aten, Vulkan, m) { m.impl("clamp", TORCH_FN(clamp)); m.impl("clamp_", TORCH_FN(clamp_)); + m.impl_UNBOXED("hardtanh", hardtanh); + m.impl_UNBOXED("hardtanh_", hardtanh_); + m.impl_UNBOXED("relu", relu); + m.impl_UNBOXED("relu_", relu_); } #endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.cpp b/aten/src/ATen/native/vulkan/ops/Convolution.cpp index 4dd85f004c5e..5bec92abb53d 100644 --- a/aten/src/ATen/native/vulkan/ops/Convolution.cpp +++ b/aten/src/ATen/native/vulkan/ops/Convolution.cpp @@ -494,11 +494,31 @@ void conv2d_pointwise( using namespace api::utils; if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) { + + vTensor v_weight_reshaped{ + context, + {1,1, v_weight.sizes()[0], v_weight.sizes()[1]}, + v_input.options(), + }; + + api::Command::Buffer temp_command_buffer = + api::context()->command().pool.allocate(); + temp_command_buffer.begin(); + + temp_command_buffer.copy( + v_weight.buffer(temp_command_buffer), + v_weight_reshaped.buffer(temp_command_buffer, vTensor::Access::Write) + ); + + temp_command_buffer.end(); + temp_command_buffer.submit(api::context()->gpu().queue); + const struct { int32_t kernel_ic, kernel_oc; int32_t stride_x, stride_y; int32_t padding_x, padding_y; float clamp_x, clamp_y; + int32_t w; } block { safe_downcast(filter[Layout::Filter::input]), safe_downcast(filter[Layout::Filter::output]), @@ -508,6 +528,7 @@ void conv2d_pointwise( safe_downcast(padding[Layout::Parameter::height]), output_min, output_max, + v_weight.sizes()[1], }; context->dispatch( @@ -529,7 +550,7 @@ void conv2d_pointwise( v_input.image(command_buffer), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_weight.image(command_buffer), + v_weight_reshaped.image(command_buffer, vTensor::Access::Read), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. 
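Editor's note: the conv2d_pw.glsl rewrite above stops sampling the weight as a 3-D texture and instead indexes the reshaped weight as a flat sequence, recovering 2-D texel coordinates from a linear index with % W and / W, where W is the width passed through uBlock.W (taken from the second weight dimension in the hunk above). The same address arithmetic as a standalone C++17 sketch:

#include <iostream>
#include <utility>

// Map a linear element index into (x, y) texel coordinates of a W-wide image.
std::pair<int, int> texel_coords(int linear_index, int W) {
  return {linear_index % W, linear_index / W};
}

int main() {
  const int W = 16;  // illustrative width of the repacked weight image
  for (int k : {0, 15, 16, 37}) {
    auto [x, y] = texel_coords(k, W);
    std::cout << k << " -> (" << x << ", " << y << ")\n";
  }
}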
v_bias.buffer(command_buffer), diff --git a/aten/src/ATen/native/vulkan/ops/Mm.cpp b/aten/src/ATen/native/vulkan/ops/Mm.cpp index f15ef15969aa..185f66226e15 100644 --- a/aten/src/ATen/native/vulkan/ops/Mm.cpp +++ b/aten/src/ATen/native/vulkan/ops/Mm.cpp @@ -28,14 +28,26 @@ Tensor addmm( const auto mat1_sizes = mat1.sizes(); const auto mat2_sizes = mat2.sizes(); - TORCH_CHECK( - (mat1_sizes[Layout::Parameter::width] == - mat2_sizes[Layout::Parameter::height]) && - (self_sizes[Layout::Parameter::height] == - mat1_sizes[Layout::Parameter::height]) && - (self_sizes[Layout::Parameter::width] == - mat2_sizes[Layout::Parameter::width]), - "Incompatible matrix dimensions!"); + if (self_sizes.size() >= 2) { + TORCH_CHECK( + (mat1_sizes[Layout::Parameter::width] == + mat2_sizes[Layout::Parameter::height]) && + (self_sizes[Layout::Parameter::height] == + mat1_sizes[Layout::Parameter::height]) && + (self_sizes[Layout::Parameter::width] == + mat2_sizes[Layout::Parameter::width]), + "Incompatible matrix dimensions!"); + } + else { + TORCH_CHECK( + (mat1_sizes[Layout::Parameter::width] == + mat2_sizes[Layout::Parameter::height]) && + ((self_sizes[Layout::Parameter::height] == + mat1_sizes[Layout::Parameter::height]) || + (self_sizes[Layout::Parameter::height] == + mat2_sizes[Layout::Parameter::width])), + "Incompatible matrix dimensions!"); + } vTensor v_output{ context, diff --git a/aten/src/ATen/test/vec256_test_all_types.h b/aten/src/ATen/test/vec256_test_all_types.h index 3226af8422d1..353f1e2c2b58 100644 --- a/aten/src/ATen/test/vec256_test_all_types.h +++ b/aten/src/ATen/test/vec256_test_all_types.h @@ -1211,22 +1211,7 @@ std::enable_if_t::value, T> local_sqrt(T x) { template std::enable_if_t>::value, Complex> local_sqrt(Complex x) { -#if defined(TEST_AGAINST_DEFAULT) return std::sqrt(x); -#else - PreventFma noFma; - // sqrt(2) / 2 * [sqrt(abs() + a) + sgn(b) * sqrt(abs() - a)i] - T real = x.real(); - T imag = x.imag(); - T abs = local_abs(x).real(); - T sqrt2_2 = std::sqrt(static_cast(2)) / static_cast(2); - T abs_r = noFma.add(abs, real); - T abs_i = noFma.sub(abs, real); - T res_r = sqrt2_2 * std::sqrt(abs_r); - T res_i = sqrt2_2 * std::sqrt(abs_i); - if (std::signbit(imag)) res_i = -res_i; - return Complex(res_r, res_i); -#endif } template @@ -1236,26 +1221,7 @@ std::enable_if_t::value, T> local_asin(T x) { template std::enable_if_t>::value, Complex> local_asin(Complex x) { -#if defined(TEST_AGAINST_DEFAULT) return std::asin(x); -#else - // asin(x) - // = -i*ln(iz + sqrt(1 -z^2)) - // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi))) - // = -i*ln((-b + ai) + sqrt(1 - (a**2 - b**2) - 2*abi)) - PreventFma noFma; - T a = x.real(); - T b = x.imag(); - T aa = a * a; - T bb = b * b; - T _ab = a * (-b); - T _2ab = noFma.add(_ab, _ab); - T aa_bb = static_cast(1) - noFma.sub(aa, bb); // 1 - (a*a-b*b) - Complex temp = Complex(-b, a) + local_sqrt(Complex(aa_bb, _2ab)); - auto ln = std::log(temp); - //-i*ln() => -i * ln => (ln.imag, -ln.real) - return Complex(ln.imag(), -ln.real()); -#endif } template @@ -1265,13 +1231,7 @@ std::enable_if_t::value, T> local_acos(T x) { template std::enable_if_t>::value, Complex> local_acos(Complex x) { -#if defined(TEST_AGAINST_DEFAULT) return std::acos(x); -#else - // pi/2 - asin(x) - auto half_pi = static_cast(M_PI) / static_cast(2); - return Complex(half_pi, 0) - local_asin(x); -#endif } template diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp index 69894a9fde5d..03105eec90ea 100644 --- a/aten/src/ATen/test/vulkan_api_test.cpp 
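Editor's note: the Mm.cpp change in this hunk relaxes the addmm shape check so a lower-rank bias (as produced by nn.Linear, and exercised by the addmm_expand test below) is accepted and broadcast across the mat1 x mat2 result. A compact sketch of the relaxed compatibility test over raw size vectors, loosely mirroring the two TORCH_CHECK branches rather than reproducing the vTensor code:

#include <cstdint>
#include <iostream>
#include <vector>

// mat1 is (M, K), mat2 is (K, N). The bias `self` must either match (M, N)
// exactly, or be a 1-d size that lines up with one of the output dimensions.
bool addmm_shapes_ok(const std::vector<int64_t>& self,
                     const std::vector<int64_t>& mat1,
                     const std::vector<int64_t>& mat2) {
  if (mat1[1] != mat2[0]) {
    return false;  // inner dimensions must agree
  }
  if (self.size() >= 2) {
    return self[0] == mat1[0] && self[1] == mat2[1];
  }
  return !self.empty() && (self[0] == mat1[0] || self[0] == mat2[1]);
}

int main() {
  std::cout << addmm_shapes_ok({1000}, {1, 1280}, {1280, 1000}) << " "  // 1
            << addmm_shapes_ok({3, 7}, {3, 5}, {5, 7}) << " "           // 1
            << addmm_shapes_ok({2, 7}, {3, 5}, {5, 7}) << "\n";         // 0
}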
+++ b/aten/src/ATen/test/vulkan_api_test.cpp @@ -164,6 +164,33 @@ TEST(VulkanAPITest, addmm) { ASSERT_TRUE(check); } +TEST(VulkanAPITest, addmm_expand) { + if (!at::is_vulkan_available()) { + return; + } + + constexpr float alpha = 2.1f; + constexpr float beta = 103.24; + + const auto bias_cpu = at::rand({1000}, at::device(at::kCPU).dtype(at::kFloat)); + const auto m1_cpu = at::rand({1, 1280}, at::device(at::kCPU).dtype(at::kFloat)); + const auto m2_cpu = at::rand({1280, 1000}, at::device(at::kCPU).dtype(at::kFloat)); + const auto out_cpu = at::addmm(bias_cpu, m1_cpu, m2_cpu, beta, alpha); + + const auto bias_vulkan = bias_cpu.vulkan(); + const auto m1_vulkan = m1_cpu.vulkan(); + const auto m2_vulkan = m2_cpu.vulkan(); + const auto out_vulkan = at::addmm(bias_vulkan, m1_vulkan, m2_vulkan, beta, alpha); + + const auto check = almostEqual(out_cpu, out_vulkan.cpu()); + if (!check) { + std::cout << "Expected:\n" << out_cpu << std::endl; + std::cout << "Got:\n" << out_vulkan.cpu() << std::endl; + } + + ASSERT_TRUE(check); +} + TEST(VulkanAPITest, avg_pool2d) { if (!at::is_vulkan_available()) { return; @@ -634,6 +661,268 @@ TEST(VulkanAPITest, upsample_nearest2d) { ASSERT_TRUE(check); } +enum class OpType { + addmm, + conv2d, + hardtanh_, + mean, + }; + +class BaseOp { + public: + explicit BaseOp(const OpType type) : type_(type) {} + virtual ~BaseOp() = default; + + virtual at::Tensor run(at::Tensor&) const = 0; + virtual std::string toString() const = 0; + + private: + OpType type_; +}; + +class Addmm final : public BaseOp { + public: + Addmm( + const int64_t m1H, + const int64_t m1W, + const int64_t m2W, + const float beta, + const float alpha) + : BaseOp(OpType::addmm), + m2_(at::rand(c10::IntArrayRef({m1W, m2W}), at::device(at::kCPU).dtype(at::kFloat))), + v_m2(m2_.vulkan()), + b_(at::rand(c10::IntArrayRef({m1H, m2W}), at::device(at::kCPU).dtype(at::kFloat))), + v_b_(b_.vulkan()), + beta_(beta), + alpha_(alpha) { + } + + at::Tensor run(at::Tensor& t) const override { + if (t.is_vulkan()) { + return at::addmm(v_b_, t, v_m2, beta_, alpha_); + } + + return at::addmm(b_, t, m2_, beta_, alpha_); + } + + std::string toString() const override { + return "addmm"; + } + + private: + at::Tensor m2_; + at::Tensor v_m2; + at::Tensor b_; + at::Tensor v_b_; + float beta_; + float alpha_; +}; + +class Conv2d final : public BaseOp { + public: + Conv2d( + const c10::IntArrayRef wsizes, + const int64_t groups, + const int64_t stride, + const int64_t padding) + : BaseOp(OpType::conv2d), + groups_(groups), + stride_(stride), + padding_(padding), + w_(at::rand(wsizes, at::device(at::kCPU).dtype(at::kFloat))), + b_(at::zeros(wsizes[0], at::device(at::kCPU).dtype(at::kFloat))){ + } + + at::Tensor run(at::Tensor& t) const override { + return at::conv2d(t, w_, b_, {stride_}, {padding_}, {1}, groups_); + } + + std::string toString() const override { + return "conv2d"; + } + + private: + int64_t groups_; + int64_t stride_; + int64_t padding_; + at::Tensor w_; + at::Tensor b_; +}; + +class Hardtanh_ final : public BaseOp { + public: + Hardtanh_() : BaseOp(OpType::hardtanh_) {} + + at::Tensor run(at::Tensor& input) const override { + return at::hardtanh_(input, 0, 6); + } + + std::string toString() const override { + return "hardtanh_"; + } +}; + +class Mean final : public BaseOp { + public: + Mean() : BaseOp(OpType::mean) {} + + at::Tensor run(at::Tensor& input) const override { + return at::mean(input, {2, 3}, false); + } + + std::string toString() const override { + return "mean"; + } +}; + +class OpsList { + public: 
+ OpsList() {} + explicit OpsList(std::vector> ops) + : ops_(std::move(ops)) { + } + + auto run(const at::Tensor& input) { + at::Tensor output = input; + + for (const auto& op : ops_) { + output = op->run(output); + } + + return output; + } + + auto run(const at::Tensor& input, const at::Tensor& v_input) { + at::Tensor output = input; + at::Tensor v_output = v_input; + + for (const auto& op : ops_) { + output = op->run(output); + v_output = op->run(v_output); + } + + return std::make_pair(output, v_output); + } + + protected: + std::vector> ops_; +}; + +class MobileNetV2 final : public OpsList { + public: + MobileNetV2() { + ops_.emplace_back(new Conv2d({32, 3, 3, 3}, 1, 2, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({32, 1, 3, 3}, 32, 1, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({16, 32, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({96, 16, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({96, 1, 3, 3}, 96, 2, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({24, 96, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({144, 24, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({144, 1, 3, 3}, 144, 1, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({24, 144, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({144, 24, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({144, 1, 3, 3}, 144, 2, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({32, 144, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({192, 32, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({192, 1, 3, 3}, 192, 1, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({32, 192, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({192, 32, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({192, 1, 3, 3}, 192, 1, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({32, 192, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({192, 32, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({192, 1, 3, 3}, 192, 2, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({64, 192, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({384, 64, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({384, 1, 3, 3}, 384, 1, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({64, 384, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({384, 64, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({384, 1, 3, 3}, 384, 1, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({64, 384, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({384, 64, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({384, 1, 3, 3}, 384, 1, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({64, 384, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({384, 64, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({384, 1, 3, 3}, 384, 1, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({96, 384, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({576, 96, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + 
ops_.emplace_back(new Conv2d({576, 1, 3, 3}, 576, 1, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({96, 576, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({576, 96, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({576, 1, 3, 3}, 576, 1, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({96, 576, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({576, 96, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({576, 1, 3, 3}, 576, 2, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({160, 576, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({960, 160, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({960, 1, 3, 3}, 960, 1, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({160, 960, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({960, 160, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({960, 1, 3, 3}, 960, 1, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({160, 960, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({960, 160, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({960, 1, 3, 3}, 960, 1, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({320, 960, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({1280, 320, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Mean()); + ops_.emplace_back(new Addmm(1, 1280, 1000, 0, 1)); + } +}; + +TEST(VulkanAPITest, mobilenetv2) { + if (!at::is_vulkan_available()) { + return; + } + + MobileNetV2 mn2; + + const auto input = at::rand({1, 3, 224, 224}, at::device(at::kCPU).dtype(at::kFloat)); + const auto output = mn2.run(input, input.vulkan()); + + const auto check = almostEqual(output.first, output.second.cpu()); + if (!check) { + std::cout << "Expected:\n" << output.first << std::endl; + std::cout << "Got:\n" << output.second.cpu() << std::endl; + } + + ASSERT_TRUE(check); +} + } // namespace #endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/test/vulkan_test.cpp b/aten/src/ATen/test/vulkan_test.cpp index c8d1b72cc06b..d5483a7327b1 100644 --- a/aten/src/ATen/test/vulkan_test.cpp +++ b/aten/src/ATen/test/vulkan_test.cpp @@ -45,7 +45,12 @@ TEST(VulkanTest, upsampleNearest2D) { auto t_out = tv_out.to(at::TensorOptions{at::Device{at::kCPU}}.dtype(at::kFloat)); - ASSERT_TRUE(almostEqual(t_out, t_out_expected)); + bool check = almostEqual(t_out_expected, t_out); + if (!check) { + std::cout << "expected:\n" << t_out_expected << std::endl; + std::cout << "got:\n" << t_out << std::endl; + } + ASSERT_TRUE(check); } TEST(VulkanTest, add) { @@ -208,7 +213,12 @@ TEST(VulkanTest, conv2dDWWeightsOnCPU) { auto tv_in = t_in.vulkan(); auto tv_out = at::conv2d(tv_in, t_w, t_b, stride, padding, dilation, groups); auto t_out = tv_out.cpu(); - ASSERT_TRUE(almostEqual(t_out, t_out_expected)); + bool check = almostEqual(t_out_expected, t_out); + if (!check) { + std::cout << "expected:\n" << t_out_expected << std::endl; + std::cout << "got:\n" << t_out << std::endl; + } + ASSERT_TRUE(check); } TEST(VulkanTest, addmm) { @@ -227,7 +237,12 @@ TEST(VulkanTest, addmm) { auto tv_b = t_b.vulkan(); auto tv_out = at::addmm(tv_b, tv_m1, tv_m2, beta, alpha); auto t_out = tv_out.cpu(); - ASSERT_TRUE(almostEqual(t_out, t_out_expected)); + bool check = almostEqual(t_out_expected, t_out); + if (!check) { + std::cout << 
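Editor's note: the Vulkan tests in this hunk and the next replace bare ASSERT_TRUE(almostEqual(...)) with a compute-the-check-first pattern that prints both tensors on mismatch, which makes failures debuggable. The same pattern as a standalone helper over float buffers (a sketch with hypothetical tolerances; the real tests compare at::Tensor values):

#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

// Tolerance comparison that dumps expected and actual values on mismatch.
bool check_almost_equal(const std::vector<float>& expected,
                        const std::vector<float>& got,
                        float atol = 1e-5f, float rtol = 1e-4f) {
  bool ok = expected.size() == got.size();
  for (std::size_t i = 0; ok && i < expected.size(); ++i) {
    ok = std::fabs(expected[i] - got[i]) <= atol + rtol * std::fabs(expected[i]);
  }
  if (!ok) {
    std::cout << "expected:";
    for (float v : expected) std::cout << ' ' << v;
    std::cout << "\ngot:";
    for (float v : got) std::cout << ' ' << v;
    std::cout << '\n';
  }
  return ok;
}

int main() {
  std::cout << check_almost_equal({1.f, 2.f}, {1.f, 2.00001f}) << "\n";  // 1
}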
"expected:\n" << t_out_expected << std::endl; + std::cout << "got:\n" << t_out << std::endl; + } + ASSERT_TRUE(check); } TEST(VulkanTest, mm) { @@ -242,7 +257,12 @@ TEST(VulkanTest, mm) { auto tv_m2 = t_m2.vulkan(); auto tv_out = tv_m1.mm(tv_m2); auto t_out = tv_out.cpu(); - ASSERT_TRUE(almostEqual(t_out, t_out_expected)); + bool check = almostEqual(t_out_expected, t_out); + if (!check) { + std::cout << "expected:\n" << t_out_expected << std::endl; + std::cout << "got:\n" << t_out << std::endl; + } + ASSERT_TRUE(check); } TEST(VulkanTest, clamp) { @@ -301,7 +321,12 @@ TEST(VulkanTest, mean) { auto tv_in = t_in.vulkan(); auto tv_out = at::mean(tv_in, {2, 3}, false); auto t_out = tv_out.cpu(); - ASSERT_TRUE(almostEqual(t_out, t_out_expected)); + bool check = almostEqual(t_out_expected, t_out); + if (!check) { + std::cout << "expected:\n" << t_out_expected << std::endl; + std::cout << "got:\n" << t_out << std::endl; + } + ASSERT_TRUE(check); } enum class OpType { conv2d, hardtanh_, mean, addmm }; @@ -874,7 +899,7 @@ TEST(VulkanTest, cat) { ASSERT_TRUE(check); } -TEST(VulkanTest, max_pool2d) { +TEST(VulkanTest, DISABLED_max_pool2d) { if (!at::is_vulkan_available()) return; diff --git a/benchmarks/operator_benchmark/benchmark_caffe2.py b/benchmarks/operator_benchmark/benchmark_caffe2.py index 495853e21c34..de56e00fa225 100644 --- a/benchmarks/operator_benchmark/benchmark_caffe2.py +++ b/benchmarks/operator_benchmark/benchmark_caffe2.py @@ -107,7 +107,7 @@ def __init__(self, op_bench, test_config): self.test_config = test_config self.framework = "Caffe2" - def run_forward(self, num_runs, print_per_iter=False, cubda_sync=False): + def run_forward(self, num_runs, print_per_iter=False, cuda_sync=False): """ Run the forward path of an operator in a loop """ with core.DeviceScope(self.op_bench.dev): @@ -115,7 +115,7 @@ def run_forward(self, num_runs, print_per_iter=False, cubda_sync=False): if not workspace.RunOperatorMultiple(op, num_runs): raise ValueError("Unable to run operator test case: {}".format(self.test_name)) - def run_backward(self, num_runs): + def run_backward(self, num_runs, print_per_iter=False): """ Run the backward path of an operator in a loop """ with core.DeviceScope(self.op_bench.dev): diff --git a/benchmarks/operator_benchmark/pt/tensor_to_test.py b/benchmarks/operator_benchmark/pt/tensor_to_test.py new file mode 100644 index 000000000000..7f4c440c2c39 --- /dev/null +++ b/benchmarks/operator_benchmark/pt/tensor_to_test.py @@ -0,0 +1,39 @@ +import operator_benchmark as op_bench +import torch + +tensor_conversion_short_configs = op_bench.cross_product_configs( + M=(8, 16, 32,), + N=(16, 64, 128,), + device=['cpu', 'cuda'], + tags=['short'], +) + +tensor_conversion_long_configs = op_bench.cross_product_configs( + M=(64, 128, 256, 512,), + N=(256, 512, 1024, 2048,), + device=['cpu', 'cuda'], + tags=['long'], +) + +class FloatToHalfTensorConversionBenchmark(op_bench.TorchBenchmarkBase): + def init(self, M, N, device): + self.input = torch.rand(M, N, device=device, requires_grad=False, dtype=torch.float) + + def forward(self): + return self.input.to(torch.half) + +class HalfToFloatTensorConversionBenchmark(op_bench.TorchBenchmarkBase): + def init(self, M, N, device): + self.input = torch.rand(M, N, device=device, requires_grad=False, dtype=torch.half) + + def forward(self): + return self.input.to(torch.float) + + +op_bench.generate_pt_test(tensor_conversion_short_configs, FloatToHalfTensorConversionBenchmark) +op_bench.generate_pt_test(tensor_conversion_long_configs, 
FloatToHalfTensorConversionBenchmark) +op_bench.generate_pt_test(tensor_conversion_short_configs, HalfToFloatTensorConversionBenchmark) +op_bench.generate_pt_test(tensor_conversion_long_configs, HalfToFloatTensorConversionBenchmark) + +if __name__ == "__main__": + op_bench.benchmark_runner.main() diff --git a/c10/test/util/Metaprogramming_test.cpp b/c10/test/util/Metaprogramming_test.cpp index 0f55814bf6f5..88c8e0facad1 100644 --- a/c10/test/util/Metaprogramming_test.cpp +++ b/c10/test/util/Metaprogramming_test.cpp @@ -243,14 +243,36 @@ namespace test_tuple_take { TEST(MetaprogrammingTest, TupleTake_nonemptyPrefix) { auto x = std::make_tuple(0, "HEY", 2.0); - auto y = tuple_take, 2>(x); + auto y = tuple_take(x); auto z = std::make_tuple(0, "HEY"); EXPECT_EQ(y, z); } TEST(MetaprogrammingTest, TupleTake_fullPrefix) { auto x = std::make_tuple(0, "HEY", 2.0); - auto y = tuple_take, 3>(x); + auto y = tuple_take(x); + EXPECT_EQ(x, y); + } + + TEST(MetaprogrammingTest, TupleTake_negative) { + auto x = std::make_tuple(0, "HEY", 2.0); + auto y = tuple_take(x); + auto z = std::make_tuple("HEY", 2.0); + EXPECT_EQ(y, z); + } +} + +namespace test_tuple_slice { + TEST(MetaprogrammingTest, TupleSlice_middle) { + auto x = std::make_tuple(0, "HEY", 2.0, false); + auto y = tuple_slice(x); + auto z = std::make_tuple("HEY", 2.0); + EXPECT_EQ(y, z); + } + + TEST(MetaprogrammingTest, TupleSlice_full) { + auto x = std::make_tuple(0, "HEY", 2.0); + auto y = tuple_slice(x); EXPECT_EQ(x, y); } } diff --git a/c10/util/C++17.h b/c10/util/C++17.h index 9329ab3b854c..cc11122e8af7 100644 --- a/c10/util/C++17.h +++ b/c10/util/C++17.h @@ -258,6 +258,11 @@ struct _if_constexpr final { * Note: In Example 3, both branches return int, so func() returns int. This is not necessary. * If func() had a return type of "auto", then both branches could return different * types, say func() could return int and func() could return string. + * + * Note: if_constexpr is *eager* w.r.t. template expansion - meaning this + * polyfill does not behave like a true "if statement at compilation time". + * The `_` trick above only defers typechecking, which happens after templates + * have been expanded. (Of course this is all that's necessary for many use cases). */ template decltype(auto) if_constexpr(ThenCallback&& thenCallback, ElseCallback&& elseCallback) { diff --git a/c10/util/Metaprogramming.h b/c10/util/Metaprogramming.h index ee5252097377..ae929a93ca09 100644 --- a/c10/util/Metaprogramming.h +++ b/c10/util/Metaprogramming.h @@ -130,6 +130,30 @@ decltype(auto) filter_map(const Mapper& mapper, Args&&... args) { } +/** + * make_offset_index_sequence + * Like make_index_sequence, but starting from Start instead of 0. + * + * Example: + * make_offset_index_sequence<10, 3> == std::index_sequence<10, 11, 12> + */ +template +struct make_offset_index_sequence_impl + : make_offset_index_sequence_impl +{ + static_assert(static_cast(Start) >= 0, "make_offset_index_sequence: Start < 0"); + static_assert(static_cast(N) >= 0, "make_offset_index_sequence: N < 0"); +}; + +template +struct make_offset_index_sequence_impl { + typedef std::index_sequence type; +}; + +template +using make_offset_index_sequence = typename make_offset_index_sequence_impl::type; + + /** * Use tuple_elements to extract a position-indexed subset of elements * from the argument tuple into a result tuple. @@ -138,22 +162,58 @@ decltype(auto) filter_map(const Mapper& mapper, Args&&... 
args) { * std::tuple t = std::make_tuple(0, "HEY", 2.0); * std::tuple result = tuple_elements(t, std::index_sequence<0, 2>()); */ -template -constexpr auto tuple_elements(Tuple t, std::index_sequence) { - return std::tuple...>(std::get(t)...); +template +constexpr auto tuple_elements(Tuple t, std::index_sequence) { + return std::tuple...>(std::get(t)...); } /** - * Use tuple_take to extract the first n elements from the argument tuple - * into a result tuple. + * Use tuple_take to extract the first or last n elements from the argument + * tuple into a result tuple. * * Example: * std::tuple t = std::make_tuple(0, "HEY", 2.0); - * std::tuple result = tuple_take(t); + * std::tuple first_two = tuple_take(t); + * std::tuple last_two = tuple_take(t); + */ +template +struct TupleTake {}; + +template +struct TupleTake= 0, void>> { + static auto call(Tuple t) { + constexpr size_t size = std::tuple_size(); + static_assert(N <= size, "tuple_take: N > size"); + return tuple_elements(t, std::make_index_sequence{}); + } +}; + +template +struct TupleTake> { + static auto call(Tuple t) { + constexpr size_t size = std::tuple_size(); + static_assert(-N <= size, "tuple_take: -N > size"); + return tuple_elements(t, make_offset_index_sequence{}); + } +}; + +template +auto tuple_take(Tuple t) { + return TupleTake::call(t); +} + +/** + * Use tuple_slice to extract a contiguous subtuple from the argument. + * + * Example: + * std::tuple t = std::make_tuple(0, "HEY", 2.0, false); + * std::tuple middle_two = tuple_slice(t); */ -template -constexpr auto tuple_take(Tuple t) { - return tuple_elements(t, std::make_index_sequence{}); +template +constexpr auto tuple_slice(Tuple t) { + constexpr size_t size = std::tuple_size(); + static_assert(Start + N <= size, "tuple_slice: Start + N > size"); + return tuple_elements(t, make_offset_index_sequence{}); } diff --git a/c10/util/intrusive_ptr.h b/c10/util/intrusive_ptr.h index 453196510aa8..825b934852d4 100644 --- a/c10/util/intrusive_ptr.h +++ b/c10/util/intrusive_ptr.h @@ -5,6 +5,11 @@ #include #include +namespace pybind11 { +template +class class_; +} + namespace c10 { class intrusive_ptr_target; namespace raw { @@ -14,6 +19,9 @@ namespace raw { namespace intrusive_ptr { inline void incref(intrusive_ptr_target * self); } + + // constructor tag used by intrusive_ptr constructors + struct DontIncreaseRefcount {}; } /** * intrusive_ptr is an alternative to shared_ptr that has better @@ -182,6 +190,16 @@ class intrusive_ptr final { friend class intrusive_ptr; friend class weak_intrusive_ptr; + // Make pybind11::class_ be a friend class of intrusive_ptr, so that custom + // smart holder in pybind11 could access the private constructor of + // intrusive_ptr(T*) which took the ownership of the object. This is required + // by customer holder macro PYBIND11_DECLARE_HOLDER_TYPE, where it uses + // intrusive_ptr(TTarget*) to initialize and take ownership of the object. For + // details, see + // https://pybind11.readthedocs.io/en/stable/advanced/smart_ptrs.html#custom-smart-pointers + template + friend class pybind11::class_; + void retain_() { if (target_ != NullType::singleton()) { size_t new_refcount = ++target_->refcount_; @@ -207,16 +225,37 @@ class intrusive_ptr final { target_ = NullType::singleton(); } + // raw pointer constructors are not public because we shouldn't make + // intrusive_ptr out of raw pointers except from inside the make_intrusive(), + // reclaim() and weak_intrusive_ptr::lock() implementations. 
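Editor's note: the Metaprogramming.h additions above build tuple_take with negative counts and tuple_slice on top of an offset index sequence. A self-contained C++14 sketch of the core trick, shifting std::make_index_sequence by a start offset and using it to pick tuple elements (simplified; the real header also handles negative N and element const/ref qualifiers):

#include <cstddef>
#include <iostream>
#include <string>
#include <tuple>
#include <utility>

// Copy the elements of `t` selected by the index pack into a new tuple.
template <class Tuple, std::size_t... Is>
auto tuple_elements(const Tuple& t, std::index_sequence<Is...>) {
  return std::make_tuple(std::get<Is>(t)...);
}

// Shift index_sequence<0, 1, ..., N-1> by Start: <Start, Start+1, ..., Start+N-1>.
template <std::size_t Start, std::size_t... Is>
constexpr auto offset_by(std::index_sequence<Is...>) {
  return std::index_sequence<(Start + Is)...>{};
}

// Contiguous subtuple covering [Start, Start + N).
template <std::size_t Start, std::size_t N, class Tuple>
auto tuple_slice(const Tuple& t) {
  static_assert(Start + N <= std::tuple_size<Tuple>::value, "slice out of range");
  return tuple_elements(t, offset_by<Start>(std::make_index_sequence<N>{}));
}

int main() {
  auto t = std::make_tuple(0, std::string("HEY"), 2.0, false);
  auto mid = tuple_slice<1, 2>(t);  // ("HEY", 2.0)
  std::cout << std::get<0>(mid) << " " << std::get<1>(mid) << "\n";
}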
+ // This constructor will not increase the ref counter for you. - // This is not public because we shouldn't make intrusive_ptr out of raw - // pointers except from inside the make_intrusive() and - // weak_intrusive_ptr::lock() implementations - explicit intrusive_ptr(TTarget* target) noexcept : target_(target) {} + // We use the tagged dispatch mechanism to explicitly mark this constructor + // to not increase the refcount + explicit intrusive_ptr(TTarget* target, raw::DontIncreaseRefcount) noexcept + : target_(target) {} + + // This constructor will increase the ref counter for you. + // This constructor will be used by the make_intrusive(), and also pybind11, + // which wrap the intrusive_ptr holder around the raw pointer and incref + // correspondingly (pybind11 requires raw pointer constructor to incref by + // default). + explicit intrusive_ptr(TTarget* target) + : intrusive_ptr(target, raw::DontIncreaseRefcount{}) { + if (target_ != NullType::singleton()) { + // We can't use retain_(), because we also have to increase weakcount + // and because we allow raising these values from 0, which retain_() + // has an assertion against. + ++target_->refcount_; + ++target_->weakcount_; + } + } public: using element_type = TTarget; - intrusive_ptr() noexcept : intrusive_ptr(NullType::singleton()) {} + intrusive_ptr() noexcept + : intrusive_ptr(NullType::singleton(), raw::DontIncreaseRefcount{}) {} intrusive_ptr(intrusive_ptr&& rhs) noexcept : target_(rhs.target_) { rhs.target_ = NullType::singleton(); @@ -347,19 +386,17 @@ class intrusive_ptr final { * passed in *must* have been created using intrusive_ptr::release(). */ static intrusive_ptr reclaim(TTarget* owning_ptr) { - return intrusive_ptr(owning_ptr); + return intrusive_ptr(owning_ptr, raw::DontIncreaseRefcount{}); } + /** + * Allocate a heap object with args and wrap it inside a intrusive_ptr and + * incref. This is a helper function to let make_intrusive() access private + * intrusive_ptr constructors. + */ template static intrusive_ptr make(Args&&... args) { - auto result = intrusive_ptr(new TTarget(std::forward(args)...)); - // We can't use retain_(), because we also have to increase weakcount - // and because we allow raising these values from 0, which retain_() - // has an assertion against. - ++result.target_->refcount_; - ++result.target_->weakcount_; - - return result; + return intrusive_ptr(new TTarget(std::forward(args)...)); } /** @@ -590,17 +627,18 @@ class weak_intrusive_ptr final { intrusive_ptr lock() const noexcept { if (expired()) { - return intrusive_ptr(NullType::singleton()); + return intrusive_ptr(); } else { auto refcount = target_->refcount_.load(); do { if (refcount == 0) { // Object already destructed, no strong references left anymore. // Return nullptr. 
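Editor's note: the intrusive_ptr rework in this hunk separates "adopt a pointer whose count is already correct" from "take ownership and bump the count" by tag dispatch rather than by two identically-typed constructors. A minimal standalone sketch of that tagged-constructor pattern (a toy refcounted handle, not c10's implementation):

#include <iostream>

struct DontIncreaseRefcount {};  // constructor tag: adopt without touching the count

struct Counted {
  int refcount = 0;
};

class Handle {
 public:
  // Adopt a pointer whose count already accounts for this handle (reclaim-style).
  Handle(Counted* p, DontIncreaseRefcount) : p_(p) {}

  // Take ownership of a raw pointer and record the new reference.
  explicit Handle(Counted* p) : p_(p) {
    if (p_) ++p_->refcount;
  }

  ~Handle() {
    if (p_ && --p_->refcount == 0) delete p_;
  }

  Handle(const Handle&) = delete;  // keep the sketch minimal
  Handle& operator=(const Handle&) = delete;

  int count() const { return p_ ? p_->refcount : 0; }

 private:
  Counted* p_;
};

int main() {
  auto* raw = new Counted{1};                   // one reference already exists
  Handle adopted(raw, DontIncreaseRefcount{});  // adopt: count stays at 1
  std::cout << adopted.count() << "\n";         // 1; destruction drops it to 0 and frees
}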
- return intrusive_ptr(NullType::singleton()); + return intrusive_ptr(); } } while (!target_->refcount_.compare_exchange_weak(refcount, refcount + 1)); - return intrusive_ptr(target_); + return intrusive_ptr( + target_, raw::DontIncreaseRefcount{}); } } diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index 4d22c27e3c7f..06ef9001c40a 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -133,6 +133,50 @@ ("aten::add_relu", datetime.date(2020, 10, 28)), ("aten::add_relu_", datetime.date(2020, 10, 28)), ("aten::hash", datetime.date(2020, 11, 15)), + ("aten::_foreach_log", datetime.date(2020, 11, 15)), + ("aten::_foreach_round", datetime.date(2020, 11, 15)), + ("aten::_foreach_sinh", datetime.date(2020, 11, 15)), + ("aten::_foreach_lgamma_", datetime.date(2020, 11, 15)), + ("aten::_foreach_lgamma", datetime.date(2020, 11, 15)), + ("aten::_foreach_log10", datetime.date(2020, 11, 15)), + ("aten::_foreach_round", datetime.date(2020, 11, 15)), + ("aten::_foreach_sin", datetime.date(2020, 11, 15)), + ("aten::_foreach_sinh_", datetime.date(2020, 11, 15)), + ("aten::_foreach_tanh_", datetime.date(2020, 11, 15)), + ("aten::_foreach_abs_", datetime.date(2020, 11, 15)), + ("aten::_foreach_sin_", datetime.date(2020, 11, 15)), + ("aten::_foreach_tan", datetime.date(2020, 11, 15)), + ("aten::_foreach_tan_", datetime.date(2020, 11, 15)), + ("aten::_foreach_log2_", datetime.date(2020, 11, 15)), + ("aten::_foreach_tanh", datetime.date(2020, 11, 15)), + ("aten::_foreach_log_", datetime.date(2020, 11, 15)), + ("aten::_foreach_log10_", datetime.date(2020, 11, 15)), + ("aten::_foreach_neg_", datetime.date(2020, 11, 15)), + ("aten::_foreach_log2", datetime.date(2020, 11, 15)), + ("aten::_foreach_log1p_", datetime.date(2020, 11, 15)), + ("aten::_foreach_abs", datetime.date(2020, 11, 15)), + ("aten::_foreach_acos", datetime.date(2020, 11, 15)), + ("aten::_foreach_acos_", datetime.date(2020, 11, 15)), + ("aten::_foreach_asin", datetime.date(2020, 11, 15)), + ("aten::_foreach_asin_", datetime.date(2020, 11, 15)), + ("aten::_foreach_atan", datetime.date(2020, 11, 15)), + ("aten::_foreach_atan_", datetime.date(2020, 11, 15)), + ("aten::_foreach_ceil", datetime.date(2020, 11, 15)), + ("aten::_foreach_ceil_", datetime.date(2020, 11, 15)), + ("aten::_foreach_cos", datetime.date(2020, 11, 15)), + ("aten::_foreach_cos_", datetime.date(2020, 11, 15)), + ("aten::_foreach_cosh", datetime.date(2020, 11, 15)), + ("aten::_foreach_cosh_", datetime.date(2020, 11, 15)), + ("aten::_foreach_erf", datetime.date(2020, 11, 15)), + ("aten::_foreach_erf_", datetime.date(2020, 11, 15)), + ("aten::_foreach_erfc", datetime.date(2020, 11, 15)), + ("aten::_foreach_erfc_", datetime.date(2020, 11, 15)), + ("aten::_foreach_expm1", datetime.date(2020, 11, 15)), + ("aten::_foreach_expm1_", datetime.date(2020, 11, 15)), + ("aten::_foreach_floor", datetime.date(2020, 11, 15)), + ("aten::_foreach_floor_", datetime.date(2020, 11, 15)), + ("aten::_foreach_log1p", datetime.date(2020, 11, 15)), + ("aten::_foreach_neg", datetime.date(2020, 11, 15)), ] def allow_listed(schema, allow_list): diff --git a/test/cpp/rpc/e2e_test_base.h b/test/cpp/rpc/e2e_test_base.h index 9d3ab71c0cfc..114284839858 100644 --- a/test/cpp/rpc/e2e_test_base.h +++ b/test/cpp/rpc/e2e_test_base.h @@ -28,7 +28,7 @@ class TestE2EBase : public ::testing::Test { autogradContainer = getDistAutogradContainer(); // Setup 
server store. - store = std::make_shared( + store = c10::make_intrusive( serverAddress, 0, numWorkers, true, std::chrono::seconds(10)); buildRpcAgent(); @@ -147,7 +147,7 @@ class TestE2EBase : public ::testing::Test { std::shared_ptr rpcAgent; static const size_t numIters; static const size_t numWorkers; - std::shared_ptr store; + c10::intrusive_ptr store; static const char* serverAddress; }; diff --git a/test/cpp_extensions/cpp_c10d_extension.cpp b/test/cpp_extensions/cpp_c10d_extension.cpp index b4901cdbcf4d..d5ba55a6379c 100644 --- a/test/cpp_extensions/cpp_c10d_extension.cpp +++ b/test/cpp_extensions/cpp_c10d_extension.cpp @@ -23,92 +23,92 @@ ProcessGroupTest::ProcessGroupTest(int rank, int size) ProcessGroupTest::~ProcessGroupTest() {} -std::shared_ptr ProcessGroupTest::broadcast( +c10::intrusive_ptr ProcessGroupTest::broadcast( std::vector& tensors, const BroadcastOptions& opts) { - return std::make_shared(); + return c10::make_intrusive(); } -std::shared_ptr ProcessGroupTest::allreduce( +c10::intrusive_ptr ProcessGroupTest::allreduce( std::vector& tensors, const AllreduceOptions& opts) { - return std::make_shared(); + return c10::make_intrusive(); } -std::shared_ptr ProcessGroupTest::allreduce_coalesced( +c10::intrusive_ptr ProcessGroupTest::allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts) { throw std::runtime_error("ProcessGroupTest does not support allreduce_coalesced"); } -std::shared_ptr ProcessGroupTest::reduce( +c10::intrusive_ptr ProcessGroupTest::reduce( std::vector& tensors, const ReduceOptions& opts) { throw std::runtime_error("ProcessGroupTest does not support reduce"); } -std::shared_ptr ProcessGroupTest::allgather( +c10::intrusive_ptr ProcessGroupTest::allgather( std::vector>& outputTensors, std::vector& inputTensors, const AllgatherOptions& opts) { throw std::runtime_error("ProcessGroupTest does not support allgather"); } -std::shared_ptr ProcessGroupTest::allgather_base( +c10::intrusive_ptr ProcessGroupTest::allgather_base( at::Tensor& outputBuffer, at::Tensor& inputBuffer, const AllgatherOptions& opts) { throw std::runtime_error("ProcessGroupTest does not support allgather_base"); } -std::shared_ptr ProcessGroupTest::barrier( +c10::intrusive_ptr ProcessGroupTest::barrier( const BarrierOptions& opts) { - return std::make_shared(); + return c10::make_intrusive(); } -std::shared_ptr ProcessGroupTest::gather( +c10::intrusive_ptr ProcessGroupTest::gather( std::vector>& outputTensors, std::vector& inputTensors, const GatherOptions& opts) { throw std::runtime_error("ProcessGroupTest does not support gather"); } -std::shared_ptr ProcessGroupTest::scatter( +c10::intrusive_ptr ProcessGroupTest::scatter( std::vector& outputTensors, std::vector>& inputTensors, const ScatterOptions& opts) { throw std::runtime_error("ProcessGroupTest does not support scatter"); } -std::shared_ptr ProcessGroupTest::reduce_scatter( +c10::intrusive_ptr ProcessGroupTest::reduce_scatter( std::vector& outputTensors, std::vector>& inputTensors, const ReduceScatterOptions& opts) { throw std::runtime_error("ProcessGroupTest does not support reduce_scatter"); } -std::shared_ptr ProcessGroupTest::send( +c10::intrusive_ptr ProcessGroupTest::send( std::vector& tensors, int dstRank, int tag) { throw std::runtime_error("ProcessGroupTest does not support send"); } -std::shared_ptr ProcessGroupTest::recv( +c10::intrusive_ptr ProcessGroupTest::recv( std::vector& tensors, int srcRank, int tag) { throw std::runtime_error("ProcessGroupTest does not support recv"); } -std::shared_ptr 
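Editor's note: the test and extension changes in this hunk swap std::shared_ptr for c10::intrusive_ptr on Store and Work objects; a type managed this way derives from c10::intrusive_ptr_target and is constructed via c10::make_intrusive. A hedged usage sketch, assuming a translation unit built against the c10 headers (MyWork is a made-up type, not the real c10d work class):

#include <c10/util/intrusive_ptr.h>
#include <iostream>

// Intrusively refcounted types inherit from c10::intrusive_ptr_target.
struct MyWork : c10::intrusive_ptr_target {
  explicit MyWork(int tag) : tag(tag) {}
  int tag;
};

c10::intrusive_ptr<MyWork> make_work(int tag) {
  // make_intrusive allocates the object and returns an owning intrusive_ptr,
  // analogous to std::make_shared for shared_ptr.
  return c10::make_intrusive<MyWork>(tag);
}

int main() {
  auto w = make_work(42);
  std::cout << w->tag << "\n";  // 42
}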
ProcessGroupTest::recvAnysource( +c10::intrusive_ptr ProcessGroupTest::recvAnysource( std::vector& tensor, int tag) { throw std::runtime_error("ProcessGroupTest does not support recvAnysource"); } std::shared_ptr ProcessGroupTest::createProcessGroupTest( - const std::shared_ptr<::c10d::Store>& store, + const c10::intrusive_ptr<::c10d::Store>& store, int rank, int size, const std::chrono::duration& timeout) { diff --git a/test/cpp_extensions/cpp_c10d_extension.hpp b/test/cpp_extensions/cpp_c10d_extension.hpp index d8dffcd20327..1773953629d5 100644 --- a/test/cpp_extensions/cpp_c10d_extension.hpp +++ b/test/cpp_extensions/cpp_c10d_extension.hpp @@ -41,67 +41,67 @@ class ProcessGroupTest : public ProcessGroup { explicit ProcessGroupTest(int rank = -1, int size = -1); virtual ~ProcessGroupTest(); - std::shared_ptr broadcast( + c10::intrusive_ptr broadcast( std::vector& data, const BroadcastOptions& opts = BroadcastOptions()) override; - std::shared_ptr allreduce( + c10::intrusive_ptr allreduce( std::vector& tensors, const AllreduceOptions& opts = AllreduceOptions()) override; - std::shared_ptr allreduce_coalesced( + c10::intrusive_ptr allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts = AllreduceCoalescedOptions()) override; - std::shared_ptr reduce( + c10::intrusive_ptr reduce( std::vector& tensors, const ReduceOptions& opts = ReduceOptions()) override; - std::shared_ptr allgather( + c10::intrusive_ptr allgather( std::vector>& outputTensors, std::vector& inputTensors, const AllgatherOptions& opts = AllgatherOptions()) override; - std::shared_ptr allgather_base( + c10::intrusive_ptr allgather_base( at::Tensor& outputBuffer, at::Tensor& inputBuffer, const AllgatherOptions& opts = AllgatherOptions()) override; - std::shared_ptr barrier( + c10::intrusive_ptr barrier( const BarrierOptions& opts = BarrierOptions()) override; - std::shared_ptr gather( + c10::intrusive_ptr gather( std::vector>& outputTensors, std::vector& inputTensors, const GatherOptions& opts = GatherOptions()) override; - std::shared_ptr scatter( + c10::intrusive_ptr scatter( std::vector& outputTensors, std::vector>& inputTensors, const ScatterOptions& opts = ScatterOptions()) override; - std::shared_ptr reduce_scatter( + c10::intrusive_ptr reduce_scatter( std::vector& outputTensors, std::vector>& inputTensors, const ReduceScatterOptions& opts = ReduceScatterOptions()) override; - std::shared_ptr send( + c10::intrusive_ptr send( std::vector& tensors, int dstRank, int tag); - std::shared_ptr recv( + c10::intrusive_ptr recv( std::vector& tensors, int srcRank, int tag); - std::shared_ptr recvAnysource( + c10::intrusive_ptr recvAnysource( std::vector& tensor, int tag); // Create a new ProcessGroupTest instance static std::shared_ptr createProcessGroupTest( - const std::shared_ptr<::c10d::Store>& store, + const c10::intrusive_ptr<::c10d::Store>& store, int rank, int size, const std::chrono::duration& timeout); diff --git a/test/jit/test_recursive_script.py b/test/jit/test_recursive_script.py index bd9a2bb32b89..48e874990a73 100644 --- a/test/jit/test_recursive_script.py +++ b/test/jit/test_recursive_script.py @@ -495,6 +495,50 @@ def forward(self, x): self.checkModule(M(), (torch.randn(5, 5),)) + def test_prepare_scriptable_basic(self): + class SeluButReluWhenScripted(torch.nn.SELU): + def __prepare_scriptable__(self): + return nn.ReLU() + + t = torch.randn(5, 5) + m = SeluButReluWhenScripted() + sm = torch.jit.script(m) + eager_out = m(t) + script_out = sm(t) + self.assertNotEqual(eager_out, script_out) + 
+ def test_prepare_scriptable_iterable_modules(self): + class SeluButReluWhenScripted(torch.nn.SELU): + def __prepare_scriptable__(self): + return nn.ReLU() + + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + shared = SeluButReluWhenScripted() + self.sequential = nn.Sequential( + SeluButReluWhenScripted(), + SeluButReluWhenScripted(), + nn.Sequential(SeluButReluWhenScripted(), shared, SeluButReluWhenScripted()), + shared, + ) + self.module_list = nn.ModuleList([SeluButReluWhenScripted(), + shared, + SeluButReluWhenScripted()]) + + def forward(self, x): + for mod in self.module_list: + x += mod(x) + x += self.sequential(x) + return x + + t = torch.randn(5, 5) + m = M() + eager_out = m(t.clone()) + sm = torch.jit.script(m) + script_out = sm(t.clone()) + self.assertNotEqual(eager_out, script_out) + def test_attributes(self): @torch.jit.script class Inner2(object): diff --git a/test/jit/test_torchbind.py b/test/jit/test_torchbind.py index df482403f6c7..c1ca50270197 100644 --- a/test/jit/test_torchbind.py +++ b/test/jit/test_torchbind.py @@ -62,6 +62,32 @@ def f(): return ss1.pop() + ss2.pop() test_equality(f, lambda x: x) + # test nn module with prepare_scriptable function + class NonJitableClass(object): + def __init__(self, int1, int2): + self.int1 = int1 + self.int2 = int2 + + def return_vals(self): + return self.int1, self.int2 + + class CustomWrapper(torch.nn.Module): + def __init__(self, foo): + super(CustomWrapper, self).__init__() + self.foo = foo + + def forward(self) -> None: + self.foo.increment(1) + return + + def __prepare_scriptable__(self): + int1, int2 = self.foo.return_vals() + foo = torch.classes._TorchScriptTesting._Foo(int1, int2) + return CustomWrapper(foo) + + foo = CustomWrapper(NonJitableClass(1, 2)) + jit_foo = torch.jit.script(foo) + def test_torchbind_take_as_arg(self): global StackString # see [local resolution in python] StackString = torch.classes._TorchScriptTesting._StackString @@ -143,6 +169,23 @@ def foo(): scripted = torch.jit.script(foo) self.assertEqual(scripted(), "mom") + def test_torchbind_class_attr_recursive(self): + class FooBar(torch.nn.Module): + def __init__(self, foo_model): + super(FooBar, self).__init__() + self.foo_mod = foo_model + + def forward(self) -> int: + return self.foo_mod.info() + + def to_ivalue(self): + torchbind_model = torch.classes._TorchScriptTesting._Foo(self.foo_mod.info(), 1) + return FooBar(torchbind_model) + + inst = FooBar(torch.classes._TorchScriptTesting._Foo(2, 3)) + scripted = torch.jit.script(inst.to_ivalue()) + self.assertEqual(scripted(), 6) + def test_torchbind_class_attribute(self): class FooBar1234(torch.nn.Module): def __init__(self): diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 34ebc70218b5..64bf20742d15 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -2498,6 +2498,14 @@ def test_logsoftmax_dim(self): input = torch.randn(3, 4, 5, 6) self.run_test(model, input) + def test_logsoftmax_dtype(self): + class Model(torch.nn.Module): + def forward(self, x): + return torch.nn.functional.log_softmax(x, dim=1, dtype=torch.float64) + + x = torch.randn(3, 4, 5, requires_grad=True) + self.run_test(Model(), x) + @skipIfUnsupportedMinOpsetVersion(9) @disableScriptTest() # scripting prim_dtype def test_lstm_no_hidden(self): @@ -3003,12 +3011,17 @@ def forward(self, input, other): y = torch.randn(6, 4) self.run_test(ViewModel(), (x, y)) - @disableScriptTest() # ONNX Shape 
inference failure in if/else block for Gemm def test_weight_norm(self): + # addmm for 3-d inputs converts to onnx::MatMul model = torch.nn.utils.weight_norm(torch.nn.Linear(5, 10), dim=1) x = torch.randn(3, 4, 5, requires_grad=True) self.run_test(model, x) + # addmm for 2-d inputs converts to onnx::Gemm + model = torch.nn.utils.weight_norm(torch.nn.Linear(5, 10), dim=1) + x = torch.randn(4, 5, requires_grad=True) + self.run_test(model, x) + model = torch.nn.utils.weight_norm(torch.nn.Conv1d(1, 1, 3)) x = torch.randn(1, 1, 5, requires_grad=True) self.run_test(model, x) @@ -3021,12 +3034,17 @@ def test_weight_norm(self): x = torch.randn(3, 3, 5, requires_grad=True) self.run_test(model, x) - @disableScriptTest() # ONNX Shape inference failure in if/else block for Gemm def test_weight_norm_nodim(self): + # addmm for 3-d inputs converts to onnx::MatMul model = torch.nn.utils.weight_norm(torch.nn.Linear(5, 10), dim=None) x = torch.randn(3, 4, 5, requires_grad=True) self.run_test(model, x) + # addmm for 2-d inputs converts to onnx::Gemm + model = torch.nn.utils.weight_norm(torch.nn.Linear(5, 10), dim=None) + x = torch.randn(4, 5, requires_grad=True) + self.run_test(model, x) + def test_flatten(self): class FlattenModel(torch.nn.Module): def forward(self, input): @@ -3382,7 +3400,9 @@ def forward(self, x): def test_eye(self): class TensorFactory(torch.nn.Module): def forward(self, x): - return torch.eye(x.size()[1], 3), torch.eye(4, 4, dtype=torch.long), torch.eye(x.size()[1], 2, dtype=torch.long) + return torch.eye(x.size()[1], 3), torch.eye(4, 4, dtype=torch.long), \ + torch.eye(x.size()[1], 2, dtype=torch.long), torch.eye(x.shape[0]), \ + torch.eye(x.shape[0], dtype=torch.float64) x = torch.randn(2, 3, 4) another_x = torch.randn(5, 6, 7) @@ -3578,6 +3598,7 @@ def forward(self, x): self.run_test(MaskedSelectModel(), x) @skipIfUnsupportedMinOpsetVersion(11) + @disableScriptTest() # dtype not available def test_index_put_to_masked_fill(self): class MaskedFillModel(torch.nn.Module): def forward(self, input_mask, some_const): @@ -3591,6 +3612,7 @@ def forward(self, input_mask, some_const): self.run_test(MaskedFillModel(), (mask, constant)) @skipIfUnsupportedMinOpsetVersion(11) + @disableScriptTest() # dtype not available def test_index_put_to_masked_scatter(self): class MaskedScatterModel(torch.nn.Module): def forward(self, input_mask, some_const): @@ -3658,7 +3680,6 @@ def forward(self, x): self.run_test(FullModel(), x) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() # dtype mismatch def test_full_like(self): class FullLikeModel(torch.nn.Module): def forward(self, x): @@ -3668,7 +3689,6 @@ def forward(self, x): self.run_test(FullLikeModel(), x) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() # dtype mismatch def test_full_like_value(self): class FullLikeModel(torch.nn.Module): def forward(self, x, y): @@ -4317,7 +4337,6 @@ def forward(self, input, target): @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() # Output dtype mismatch def test_kldiv_loss(self): x = torch.randn(5) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index e057e25643a4..16694b0f0356 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -500,48 +500,52 @@ def forward(self, x): original_ref_m.conv2.bias = torch.nn.Parameter(original_m.standalone.conv.bias.detach()) qconfig_dict = {"": default_qconfig} - prepare_custom_config_dict = {"standalone_module_name": ["standalone"]} - # check prepared model - m = 
prepare_fx( - original_m, qconfig_dict, prepare_custom_config_dict=prepare_custom_config_dict) - # calibration - m(data) - # input and output of first conv, observer for standalone module - # will be inserted in the standalone module itself - count_check = { - ns.call_module(torch.quantization.MinMaxObserver): 2 - } - self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) - # for output of conv in the standalone module - count_check = { - ns.call_module(torch.quantization.MinMaxObserver): 1 - } - self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=count_check) + config_name = {"standalone_module_name": ["standalone"]} + config_class = {"standalone_module_class": [StandaloneModule]} + for prepare_config in [config_name, config_class]: + original_m_copy = copy.deepcopy(original_m) + original_ref_m_copy = copy.deepcopy(original_ref_m) + # check prepared model + m = prepare_fx( + original_m_copy, qconfig_dict, prepare_custom_config_dict=prepare_config) + # calibration + m(data) + # input and output of first conv, observer for standalone module + # will be inserted in the standalone module itself + count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 2 + } + self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) + # for output of conv in the standalone module + count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 1 + } + self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=count_check) - # check converted/quantized model - m = convert_fx(m) - count_check = { - ns.call_function(torch.quantize_per_tensor) : 1, - ns.call_module(nnq.Conv2d) : 1, - ns.call_method('dequantize') : 1, - } - self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) - count_check = { - # quantization of input happens in parent module - # quantization of output happens in the quantized conv module - ns.call_function(torch.quantize_per_tensor) : 0, - # dequantization for output happens in parent module - ns.call_method('dequantize') : 0, - } - self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=count_check) - res = m(data) - - # quantize the reference model - ref_m = prepare_fx(original_ref_m, qconfig_dict) - ref_m(data) - ref_m = convert_fx(ref_m) - ref_res = ref_m(data) - self.assertEqual(res, ref_res) + # check converted/quantized model + m = convert_fx(m) + count_check = { + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_module(nnq.Conv2d) : 1, + ns.call_method('dequantize') : 1, + } + self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) + count_check = { + # quantization of input happens in parent module + # quantization of output happens in the quantized conv module + ns.call_function(torch.quantize_per_tensor) : 0, + # dequantization for output happens in parent module + ns.call_method('dequantize') : 0, + } + self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=count_check) + res = m(data) + + # quantize the reference model + ref_m = prepare_fx(original_ref_m_copy, qconfig_dict) + ref_m(data) + ref_m = convert_fx(ref_m) + ref_res = ref_m(data) + self.assertEqual(res, ref_res) @skipIfNoFBGEMM def test_qconfig_none(self): diff --git a/test/quantization/test_workflow_module.py b/test/quantization/test_workflow_module.py index cd722d59d2a2..22751697cd1d 100644 --- a/test/quantization/test_workflow_module.py +++ b/test/quantization/test_workflow_module.py @@ -482,6 +482,29 @@ def test_save_load_state_dict_script(self): # Verify that state_dict matches exactly with original 
one. self.assertEqual(scripted.state_dict(), scripted_2.state_dict()) + + @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + def test_observer_qparams_respects_device_affinity(self): + """ + Ensure that the scale and zero_point returned by the observer + are on the same device as the input tensor. + """ + observerList = [MinMaxObserver(), + MovingAverageMinMaxObserver(), + PerChannelMinMaxObserver(), + MovingAveragePerChannelMinMaxObserver()] + for obs in observerList: + device = torch.device('cuda:1') + x = torch.randn(1, 2, device=device) + obs.to(device) + result = obs(x) + scale, zero_point = obs.calculate_qparams() + + self.assertEqual(x.device, scale.device) + self.assertEqual(x.device, zero_point.device) + + # HistogramObserver that works like it does on master class _ReferenceHistogramObserver(HistogramObserver): def __init__(self, *args, **kwargs): diff --git a/test/test_autograd.py b/test/test_autograd.py index e651bfe477dd..177a9b4c7805 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -3271,7 +3271,7 @@ def test_profiler_aggregation_lstm(self): print(prof.key_averages(group_by_input_shape=True).table( sort_by="self_cpu_time_total", row_limit=10)) print(prof.table( - sort_by="self_cpu_time_total", row_limit=10, header="TEST", top_level_events_only=True)) + sort_by="self_cpu_time_total", row_limit=10, max_src_column_width=300, header="TEST", top_level_events_only=True)) print(prof.key_averages(group_by_input_shape=True).table( sort_by="self_cpu_time_total", row_limit=10, top_level_events_only=True)) diff --git a/test/test_foreach.py b/test/test_foreach.py index e164fd352648..a723efa04684 100644 --- a/test/test_foreach.py +++ b/test/test_foreach.py @@ -28,35 +28,6 @@ class TestForeach(TestCase): torch.div, ] - unary_ops = [ - # foreach_op, foreach_op_, torch_op, bf16, complex64/128 - (torch._foreach_sqrt, torch._foreach_sqrt_, torch.sqrt, True , True), - (torch._foreach_exp, torch._foreach_exp_, torch.exp, True, True), - (torch._foreach_acos, torch._foreach_acos_, torch.acos, False, True), - (torch._foreach_asin, torch._foreach_asin_, torch.asin, False, True), - (torch._foreach_atan, torch._foreach_atan_, torch.atan, False, True), - (torch._foreach_cos, torch._foreach_cos_, torch.cos, True, True), - (torch._foreach_cosh, torch._foreach_cosh_, torch.cosh, False, True), - (torch._foreach_log, torch._foreach_log_, torch.log, True, True), - (torch._foreach_log10, torch._foreach_log10_, torch.log10, True, True), - (torch._foreach_log2, torch._foreach_log2_, torch.log2, True, True), - (torch._foreach_neg, torch._foreach_neg_, torch.neg, True, True), - (torch._foreach_tan, torch._foreach_tan_, torch.tan, False, True), - (torch._foreach_tanh, torch._foreach_tanh_, torch.tanh, True, True), - (torch._foreach_sin, torch._foreach_sin_, torch.sin, False, True), - (torch._foreach_sinh, torch._foreach_sinh_, torch.sinh, False, True), - (torch._foreach_ceil, torch._foreach_ceil_, torch.ceil, False, False), - (torch._foreach_erf, torch._foreach_erf_, torch.erf, True, False), - (torch._foreach_erfc, torch._foreach_erfc_, torch.erfc, False, False), - (torch._foreach_expm1, torch._foreach_expm1_, torch.expm1, False, False), - (torch._foreach_floor, torch._foreach_floor_, torch.floor, False, False), - (torch._foreach_log1p, torch._foreach_log1p_, torch.log1p, True, False), - (torch._foreach_round, torch._foreach_round_, torch.round, False, False), - - # See test_abs - # (torch._foreach_abs, torch._foreach_abs_, 
torch.abs, True, True), - ] - def _get_test_data(self, device, dtype, N): if dtype in [torch.bfloat16, torch.bool, torch.float16]: tensors = [torch.randn(N, N, device=device).to(dtype) for _ in range(N)] @@ -85,6 +56,21 @@ def _test_bin_op_list(self, device, dtype, foreach_op, foreach_op_, torch_op): else: self.assertEqual(tensors1, expected) + def _test_unary_op(self, device, dtype, foreach_op, foreach_op_, torch_op): + for N in N_values: + tensors1 = self._get_test_data(device, dtype, N) + # Mimics cuda kernel dtype flow. With fp16/bf16 input, runs in fp32 and casts output back to fp16/bf16. + control_dtype = torch.float32 if (self.device_type == 'cuda' and + (dtype is torch.float16 or dtype is torch.bfloat16)) else dtype + expected = [torch_op(tensors1[i].to(dtype=control_dtype)).to(dtype=dtype) for i in range(N)] + res = foreach_op(tensors1) + foreach_op_(tensors1) + self.assertEqual(res, tensors1) + if (dtype is torch.float16 or dtype is torch.bfloat16) and TEST_WITH_ROCM: + self.assertEqual(tensors1, expected, atol=1.e-3, rtol=self.dtype_precisions[dtype][0]) + else: + self.assertEqual(tensors1, expected) + def _test_pointwise_op(self, device, dtype, foreach_op, foreach_op_, torch_op): for N in N_values: values = [2 + i for i in range(N)] @@ -163,106 +149,13 @@ def _test_bin_op_list_alpha(self, device, dtype, foreach_op, foreach_op_, torch_ # # Unary ops # - @dtypes(*(torch.testing.floating_and_complex_types_and(torch.bfloat16, torch.half))) - def test_unary_ops(self, device, dtype): - for fe_op, fe_op_, torch_op, support_bfloat16, support_complex in self.unary_ops: - for N in N_values: - tensors1 = self._get_test_data(device, dtype, N) - # Mimics cuda kernel dtype flow. With fp16/bf16 input, runs in fp32 and casts output back to fp16/bf16. - control_dtype = torch.float32 if (self.device_type == 'cuda' and - (dtype is torch.float16 or dtype is torch.bfloat16)) else dtype - - if self.device_type == 'cpu' and dtype == torch.half and torch_op != torch.neg: - with self.assertRaisesRegex(RuntimeError, r"not implemented for \'Half\'"): - expected = [torch_op(tensors1[i]) for i in range(N)] - - with self.assertRaisesRegex(RuntimeError, r"not implemented for \'Half\'"): - res = fe_op(tensors1) - break - - if dtype == torch.bfloat16 and not support_bfloat16: - if self.device_type == 'cuda' or torch_op in [torch.sinh, torch.cosh]: - with self.assertRaisesRegex(RuntimeError, r"not implemented for \'BFloat16\'"): - expected = [torch_op(tensors1[i]) for i in range(N)] - - with self.assertRaisesRegex(RuntimeError, r"not implemented for \'BFloat16\'"): - res = fe_op(tensors1) - break - - if dtype in [torch.complex64, torch.complex128] and not support_complex: - # not using assertRaisesRegex due to different error messages - with self.assertRaises(RuntimeError): - expected = [torch_op(tensors1[i]) for i in range(N)] - - with self.assertRaises(RuntimeError): - res = fe_op(tensors1) - break - - expected = [torch_op(tensors1[i].to(dtype=control_dtype)).to(dtype=dtype) for i in range(N)] - res = fe_op(tensors1) - if (dtype is torch.float16 or dtype is torch.bfloat16) and TEST_WITH_ROCM: - self.assertEqual(res, expected, atol=1.e-3, rtol=self.dtype_precisions[dtype][0]) + @dtypes(*[torch.float, torch.double, torch.complex64, torch.complex128]) + def test_sqrt(self, device, dtype): + self._test_unary_op(device, dtype, torch._foreach_sqrt, torch._foreach_sqrt_, torch.sqrt) - fe_op_(tensors1) - self.assertEqual(res, tensors1) - else: - self.assertEqual(res, expected) - - fe_op_(tensors1) - 
self.assertEqual(res, tensors1) - - # Separate test for abs due to a lot of special cases - # Absolute value of a complex number a + bj is defined as sqrt(a^2 + b^2), i.e. a floating point - @dtypes(*(torch.testing.floating_and_complex_types_and(torch.bfloat16, torch.half))) - def test_abs(self, device, dtype): - for N in N_values: - tensors1 = self._get_test_data(device, dtype, N) - # Mimics cuda kernel dtype flow. With fp16/bf16 input, runs in fp32 and casts output back to fp16/bf16. - control_dtype = torch.float32 if (self.device_type == 'cuda' and - (dtype is torch.float16 or dtype is torch.bfloat16)) else dtype - - expected = [torch.abs(tensors1[i].to(dtype=control_dtype)).to(dtype=dtype) for i in range(N)] - res = torch._foreach_abs(tensors1) - if (dtype is torch.float16 or dtype is torch.bfloat16) and TEST_WITH_ROCM: - self.assertEqual(res, expected, atol=1.e-3, rtol=self.dtype_precisions[dtype][0]) - - torch._foreach_abs_(tensors1) - self.assertEqual(res, tensors1) - else: - if self.device_type == 'cpu': - if dtype == torch.complex64: - expected = [torch.abs(tensors1[i].to(dtype=control_dtype)) for i in range(N)] - self.assertEqual(res, expected) - - with self.assertRaisesRegex(RuntimeError, r"In-place abs is not supported for complex tensors."): - torch._foreach_abs_(tensors1) - break - elif dtype == torch.complex128: - expected = [torch.abs(tensors1[i].to(dtype=control_dtype)) for i in range(N)] - self.assertEqual(res, expected) - - with self.assertRaisesRegex(RuntimeError, r"In-place abs is not supported for complex tensors."): - torch._foreach_abs_(tensors1) - break - else: - self.assertEqual(res, expected) - else: - if dtype == torch.complex64: - expected = [torch.abs(tensors1[i].to(dtype=control_dtype)).to(torch.complex64) for i in range(N)] - self.assertEqual(res, expected) - - torch._foreach_abs_(tensors1) - self.assertEqual(res, tensors1) - break - elif dtype == torch.complex128: - expected = [torch.abs(tensors1[i].to(dtype=control_dtype)).to(torch.complex128) for i in range(N)] - self.assertEqual(res, expected) - - torch._foreach_abs_(tensors1) - self.assertEqual(res, tensors1) - break - else: - self.assertEqual(res, expected) + @dtypes(*[torch.float, torch.double, torch.complex64, torch.complex128]) + def test_exp(self, device, dtype): + self._test_unary_op(device, dtype, torch._foreach_exp, torch._foreach_exp_, torch.exp) # # Pointwise ops diff --git a/test/test_fx.py b/test/test_fx.py index 349941c72f86..dcb104528402 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -1104,6 +1104,54 @@ def forward(self, x): traced = torch.fx.symbolic_trace(Foo()) assert(all('constant' not in node.target for node in traced.graph.nodes)) + def test_single_default_arg(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, y=1): + return y + + m = M() + self.checkGraphModule(m, ()) + self.checkGraphModule(m, (3,)) + + def test_multiple_default_args(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, y=1, z=2): + return y + z + + m = M() + self.checkGraphModule(m, ()) + self.checkGraphModule(m, (3,)) + self.checkGraphModule(m, (3, 4)) + + def test_regular_and_default_args(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y=1): + return x + y + + m = M() + self.checkGraphModule(m, (2,)) + self.checkGraphModule(m, (2, 3)) + + def test_string_literal_return(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + + def 
forward(self): + return "foo" + + m = M() + self.checkGraphModule(m, ()) + if __name__ == '__main__': run_tests() diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py index ab387a12a6ea..9a75663e4205 100644 --- a/test/test_fx_experimental.py +++ b/test/test_fx_experimental.py @@ -1,36 +1,121 @@ import torch +from typing import Dict from torch.fx.symbolic_trace import symbolic_trace from torch.fx.graph_module import GraphModule +from torch.fx.node import Node from torch.fx.experimental import GraphManipulation from torch.fx.experimental.Partitioner import Partitioner, Device, PartitionerConfig from torch.fx.experimental.rewriter import RewritingTracer from torch.testing._internal.common_utils import run_tests from torch.testing._internal.jit_utils import JitTestCase -from torch.fx.experimental.partitioner_utils import NodeLatency, \ - get_partition_to_latency_mapping, get_latency_of_partitioned_graph +from torch.fx.experimental.partitioner_utils import ( + NodeLatency, + get_partition_to_latency_mapping, + get_latency_of_partitioned_graph, +) from typing import Union, Callable + def symbolic_trace_with_rewrite(root: Union[torch.nn.Module, Callable]) -> GraphModule: - return GraphModule(root if isinstance(root, torch.nn.Module) else torch.nn.Module(), RewritingTracer().trace(root)) + return GraphModule( + root if isinstance(root, torch.nn.Module) else torch.nn.Module(), + RewritingTracer().trace(root), + ) + class TestFXExperimental(JitTestCase): + def test_serialize_graph(self): + class TestModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(4, 4) + self.e = torch.rand(4) + + def forward(self, a, b): + add_1 = a + b + linear = self.linear(add_1) + add_2 = linear + self.e + return add_2 + + m = TestModule() + traced = symbolic_trace(m) + a = torch.rand(4) + b = torch.rand(4) + GraphManipulation.get_size_of_all_nodes(traced, [a, b]) + + partitioner = Partitioner() + devices = [Device("dev_0", 5000, 0), Device("dev_1", 125, 1)] + partitioner_config = PartitionerConfig(devices, is_sparse_nn=True) + ret = partitioner.partition_graph(traced, m, partitioner_config) + module_with_submodules = ret.module_with_submodules + # Fix for now to add type/shape to output + for node in traced.graph.nodes: + if node.op == "output": + node.shape = a.shape + node.dtype = a.dtype + for mod in module_with_submodules.modules(): + if isinstance(mod, GraphModule): + for node in mod.graph.nodes: + node.shape = a.shape + node.dtype = a.dtype + for node in module_with_submodules.graph.nodes: + node.shape = a.shape + node.dtype = a.dtype + + agm1 = GraphManipulation.AcceleratedGraphModule(traced) + agm2 = GraphManipulation.AcceleratedGraphModule(module_with_submodules) + assert len(agm1.weights) == 3 + assert len(agm2.weights) == 3 + assert len(agm1.serialized_graph["nodes"]) == 7 + assert len(agm1.serialized_graph["weights"]) == 3 + assert len(agm1.serialized_graph["modules"]) == 0 + assert len(agm2.serialized_graph["nodes"]) == 5 + assert len(agm2.serialized_graph["weights"]) == 3 + assert len(agm2.serialized_graph["modules"]) == 1 + assert agm1.serialized_graph["weights"]["linear.weight"]["shape"] == "[4, 4]" + assert ( + agm1.serialized_graph["weights"]["linear.weight"]["dtype"] + == "torch.float32" + ) + assert ( + agm1.serialized_graph["weights"]["linear.weight"]["is_quantized"] is False + ) + assert agm1.serialized_graph["nodes"][0]["shape"] == "[4]" + assert agm1.serialized_graph["nodes"][0]["dtype"] == "torch.float32" + assert 
agm1.serialized_graph["nodes"][0]["target"] == "a" + assert agm1.serialized_graph["nodes"][0]["op_code"] == "placeholder" + assert agm1.serialized_graph["nodes"][0]["name"] == "a" + assert agm1.serialized_graph["nodes"][2]["args"][0]["name"] == "a" + assert agm1.serialized_graph["nodes"][2]["args"][0]["is_node"] is True + + # Test quantization info serialization. + x = torch.tensor([[-1.0, 0.0], [1.0, 2.0]]) + q_tensor = torch.quantize_per_tensor(x, 1, 0, torch.qint32) + q_tensor_channel = torch.quantize_per_channel( + x, torch.tensor([0.1, 0.01]), torch.tensor([10, 0]), 0, torch.quint8 + ) + result = GraphManipulation.serialize_tensor_quantization(q_tensor) + result2 = GraphManipulation.serialize_tensor_quantization(q_tensor_channel) + assert result["q_scheme"] == "torch.per_tensor_affine" + assert result["q_scale"] == 1.0 + assert result2["q_scheme"] == "torch.per_channel_affine" + assert len(result2["q_per_channel_scales"]) == 2 + def test_find_single_partition(self): class TestModule(torch.nn.Module): def forward(self, a, b): return a + b + m = TestModule() traced = symbolic_trace(m) a = torch.rand(1) b = torch.rand(1) - GraphManipulation.get_size_of_all_nodes( - traced, - [a, b] - ) + GraphManipulation.get_size_of_all_nodes(traced, [a, b]) partitioner = Partitioner() devices = [ - Device('dev_0', 125, 0), - Device('dev_1', 125, 1), - Device('dev_2', 125, 2) + Device("dev_0", 125, 0), + Device("dev_1", 125, 1), + Device("dev_2", 125, 2), ] partitioner_config = PartitionerConfig(devices) ret = partitioner.partition_graph(traced, m, partitioner_config) @@ -56,15 +141,12 @@ def forward(self, a, b): traced = symbolic_trace(m) a = torch.rand(4) b = torch.rand(4) - GraphManipulation.get_size_of_all_nodes( - traced, - [a, b] - ) + GraphManipulation.get_size_of_all_nodes(traced, [a, b]) partitioner = Partitioner() devices = [ - Device('dev_0', 125, 0), - Device('dev_1', 125, 1), - Device('dev_2', 125, 2) + Device("dev_0", 125, 0), + Device("dev_1", 125, 1), + Device("dev_2", 125, 2), ] partitioner_config = PartitionerConfig(devices) ret = partitioner.partition_graph(traced, m, partitioner_config) @@ -91,15 +173,9 @@ def forward(self, a): m = TestModule() traced = symbolic_trace(m) a = torch.rand(4) - GraphManipulation.get_size_of_all_nodes( - traced, - [a] - ) + GraphManipulation.get_size_of_all_nodes(traced, [a]) partitioner = Partitioner() - devices = [ - Device('dev_0', 120, 0), - Device('dev_1', 160, 1) - ] + devices = [Device("dev_0", 120, 0), Device("dev_1", 160, 1)] partitioner_config = PartitionerConfig(devices, is_sparse_nn=False) ret = partitioner.partition_graph(traced, m, partitioner_config) module_with_submodules = ret.module_with_submodules @@ -128,12 +204,12 @@ def __init__(self): layers = self.create_mlp(3, 24, 24) self.top_layers = torch.nn.Sequential(*layers) self.embedding_layers = torch.nn.ModuleList() - el = torch.nn.EmbeddingBag(500000, 4, mode='sum', sparse=True) + el = torch.nn.EmbeddingBag(500000, 4, mode="sum", sparse=True) self.embedding_layers.append(el) for i in range(3): - el = torch.nn.EmbeddingBag(1000000, 4, mode='sum', sparse=True) + el = torch.nn.EmbeddingBag(1000000, 4, mode="sum", sparse=True) self.embedding_layers.append(el) - el = torch.nn.EmbeddingBag(500000, 4, mode='sum', sparse=True) + el = torch.nn.EmbeddingBag(500000, 4, mode="sum", sparse=True) self.embedding_layers.append(el) def forward(self, a, b, offset): @@ -141,27 +217,29 @@ def forward(self, a, b, offset): y = [] c = [] for i in range(len(self.embedding_layers)): - temp = torch.randint(10, 
(8, )) + temp = torch.randint(10, (8,)) c.append(temp + b) for i in range(len(self.embedding_layers)): if i % 2 == 0: y.append(self.embedding_layers[i](c[i], offset)) else: - y.append(self.embedding_layers[i](torch.randint(10, (8, )), offset)) + y.append( + self.embedding_layers[i](torch.randint(10, (8,)), offset) + ) z = torch.cat([x] + y, dim=1) p = self.top_layers(z) return p m = MyRecommendationModule() a = torch.rand(2, 4) - b = torch.randint(10, (8, )) - offset = torch.randint(1, (2, )) + b = torch.randint(10, (8,)) + offset = torch.randint(1, (2,)) traced = symbolic_trace(m) GraphManipulation.get_size_of_all_nodes(traced, [a, b, offset]) devices = [ - Device('dev_0', 33000000, 0), - Device('dev_1', 33000000, 1), - Device('dev_2', 33000000, 2) + Device("dev_0", 33000000, 0), + Device("dev_1", 33000000, 1), + Device("dev_2", 33000000, 2), ] partitioner_config = PartitionerConfig(devices, is_sparse_nn=True) partitioner = Partitioner() @@ -187,15 +265,19 @@ def forward(self, a): def get_node_to_latency_mapping(fx_module: GraphModule): """Given a fx module, generate node latency for each node - based on the size of each node + based on the size of each node """ node_to_latency_mapping: Dict[Node, NodeLatency] = {} for node in fx_module.graph.nodes: - if node.op not in {'output', 'placeholder', 'get_attr'}: + if node.op not in {"output", "placeholder", "get_attr"}: if node.size_bytes.total_size == node.size_bytes.output_size: - node_to_latency_mapping[node] = NodeLatency(node.size_bytes.total_size, 2. * node.size_bytes.total_size) + node_to_latency_mapping[node] = NodeLatency( + node.size_bytes.total_size, 2.0 * node.size_bytes.total_size + ) else: - node_to_latency_mapping[node] = NodeLatency(node.size_bytes.total_size, node.size_bytes.output_size) + node_to_latency_mapping[node] = NodeLatency( + node.size_bytes.total_size, node.size_bytes.output_size + ) return node_to_latency_mapping m = TestModule() @@ -203,36 +285,33 @@ def get_node_to_latency_mapping(fx_module: GraphModule): a = torch.rand(4) GraphManipulation.get_size_of_all_nodes(traced, [a]) node_to_latency_mapping = get_node_to_latency_mapping(traced) - devices = [ - Device('dev_0', 200, 0), - Device('dev_1', 200, 1) - ] + devices = [Device("dev_0", 200, 0), Device("dev_1", 200, 1)] partitioner = Partitioner() partitioner_config = PartitionerConfig(devices, False) ret = partitioner.partition_graph(traced, m, partitioner_config) module_with_submodules = ret.module_with_submodules self.assertEqual(traced(a), module_with_submodules(a)) partitions = partitioner.partitions - partition_to_latency_mapping = get_partition_to_latency_mapping(partitions, node_to_latency_mapping) + partition_to_latency_mapping = get_partition_to_latency_mapping( + partitions, node_to_latency_mapping + ) for p in partition_to_latency_mapping: if p.partition_id == 0: - assert partition_to_latency_mapping[p] == (128., 80., 160.) + assert partition_to_latency_mapping[p] == (128.0, 80.0, 160.0) else: - assert partition_to_latency_mapping[p] == (16., 32., 32.) + assert partition_to_latency_mapping[p] == (16.0, 32.0, 32.0) transfer_rate_bytes_per_sec = 0.5 critical_path_latency_sec = get_latency_of_partitioned_graph( - partitions, - partition_to_latency_mapping, - transfer_rate_bytes_per_sec + partitions, partition_to_latency_mapping, transfer_rate_bytes_per_sec ) - assert critical_path_latency_sec == 208. 
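# [Editor's sketch, not part of the patch] A pure-Python toy illustrating the quantity
# checked by the latency assertions nearby: the "critical path latency" of a
# partitioned graph is the most expensive chain of partitions, where each hop adds the
# partition's own latency plus output_bytes / transfer_rate for the data handed to the
# next partition. The numbers are chosen to reproduce the 208.0 asserted here; the
# exact decomposition used by partitioner_utils may differ.
def critical_path_latency(partitions, edges, transfer_rate_bytes_per_sec):
    # partitions: {id: (overall_latency_sec, output_bytes)}
    # edges: {id: [downstream partition ids]}; roots are partitions with no parent
    def path_cost(pid):
        latency, out_bytes = partitions[pid]
        children = edges.get(pid, [])
        if not children:
            return latency
        comm = out_bytes / transfer_rate_bytes_per_sec
        return latency + comm + max(path_cost(c) for c in children)
    roots = set(partitions) - {c for cs in edges.values() for c in cs}
    return max(path_cost(r) for r in roots)

# Two partitions in a chain: 160s + 8 bytes / 0.5 B/s + 32s == 208s.
assert critical_path_latency({0: (160.0, 8.0), 1: (32.0, 0.0)}, {0: [1]}, 0.5) == 208.0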
+ assert critical_path_latency_sec == 208.0 def test_call_to_assert_no_msg(self): - class M(torch.nn.Module): def forward(self, a, b): assert a == b return a + b + m = M() traced = symbolic_trace_with_rewrite(m) @@ -240,7 +319,12 @@ def forward(self, a, b): traced.graph.lint(traced) # Check the IR to make sure there's a call_function node with target == "Assert" - self.assertTrue(any(node.op == "call_function" and node.target == torch.Assert for node in traced.graph.nodes)) + self.assertTrue( + any( + node.op == "call_function" and node.target == torch.Assert + for node in traced.graph.nodes + ) + ) # Ensure that the assert throws when it's supposed to and doesn't throw when it's not supposed to traced(3, 3) @@ -251,11 +335,11 @@ def forward(self, a, b): self.assertEqual(traced(3, 3), m(3, 3)) def test_call_to_assert_with_msg(self): - class M(torch.nn.Module): def forward(self, a, b): assert a == b, "test message" return a + b + m = M() traced = symbolic_trace_with_rewrite(m) @@ -263,7 +347,12 @@ def forward(self, a, b): traced.graph.lint(traced) # Check the IR to make sure there's a call_function node with target == "Assert" - self.assertTrue(any(node.op == "call_function" and node.target == torch.Assert for node in traced.graph.nodes)) + self.assertTrue( + any( + node.op == "call_function" and node.target == torch.Assert + for node in traced.graph.nodes + ) + ) # Ensure that the assert throws when it's supposed to and doesn't throw when it's not supposed to traced(3, 3) @@ -274,11 +363,11 @@ def forward(self, a, b): self.assertEqual(traced(3, 3), m(3, 3)) def test_call_to_assert_with_empty_msg(self): - class M(torch.nn.Module): def forward(self, a, b): assert a == b, "" return a + b + m = M() traced = symbolic_trace_with_rewrite(m) @@ -286,7 +375,12 @@ def forward(self, a, b): traced.graph.lint(traced) # Check the IR to make sure there's a call_function node with target == "Assert" - self.assertTrue(any(node.op == "call_function" and node.target == torch.Assert for node in traced.graph.nodes)) + self.assertTrue( + any( + node.op == "call_function" and node.target == torch.Assert + for node in traced.graph.nodes + ) + ) # Ensure that the assert throws when it's supposed to and doesn't throw when it's not supposed to traced(3, 3) @@ -297,7 +391,6 @@ def forward(self, a, b): self.assertEqual(traced(3, 3), m(3, 3)) def test_call_to_assert_with_multiline_message(self): - class M(torch.nn.Module): def forward(self, a, b): error_msg = """ @@ -306,6 +399,7 @@ def forward(self, a, b): """ assert a == b, error_msg return a + b + m = M() traced = symbolic_trace_with_rewrite(m) @@ -313,7 +407,12 @@ def forward(self, a, b): traced.graph.lint(traced) # Check the IR to make sure there's a call_function node with target == "Assert" - self.assertTrue(any(node.op == "call_function" and node.target == torch.Assert for node in traced.graph.nodes)) + self.assertTrue( + any( + node.op == "call_function" and node.target == torch.Assert + for node in traced.graph.nodes + ) + ) # Ensure that the assert throws when it's supposed to and doesn't throw when it's not supposed to error_msg = """ @@ -330,7 +429,9 @@ def forward(self, a, b): def test_traceable_function_with_nonstandard_name(self): def foo(x): return torch.relu(x) + traced = symbolic_trace_with_rewrite(foo) -if __name__ == '__main__': + +if __name__ == "__main__": run_tests() diff --git a/test/test_linalg.py b/test/test_linalg.py index 53ba84e6348a..cbab1bde6963 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -1208,6 +1208,145 @@ def 
test_dot_invalid_args(self, device): self._test_dot_vdot_invalid_args(device, torch.dot) self._test_dot_vdot_invalid_args(device, torch.dot, complex_dtypes=True) + def test_einsum(self, device): + def check(equation, *operands): + ref = np.einsum(equation, *[operand.cpu().numpy() for operand in operands]) + res = torch.einsum(equation, operands) + self.assertEqual(res.cpu(), torch.from_numpy(np.array(ref))) + + # Autograd check (FIXME: tests below fail check) + if equation not in {"i,i->", "i,i->i", "ij,ij->ij"}: + ops = [op.detach().requires_grad_() for op in operands] + self.assertTrue(torch.autograd.gradcheck(lambda *ops: torch.einsum(equation, ops), ops)) + for op in ops: + self.assertTrue(op._version == 0) + + # Test cases from https://gist.github.com/rockt/15ee013889d65342088e9260a377dc8f + x = torch.rand(5, device=device) + y = torch.rand(7, device=device) + A = torch.randn(3, 5, device=device) + B = torch.randn(2, 5, device=device) + C = torch.randn(2, 3, 5, device=device) + D = torch.randn(2, 5, 7, device=device) + E = torch.randn(7, 9, device=device) + F = torch.randn(2, 3, 3, 5, device=device) + G = torch.randn(5, 4, 6, device=device) + H = torch.randn(4, 4, device=device) + I = torch.rand(2, 3, 2, device=device) + + # Vector operations + check('i->', x) # sum + check('i,i->', x, x) # dot + check('i,i->i', x, x) # vector element-wisem mul + check('i,j->ij', x, y) # outer + + # Matrix operations + check("ij->ji", A) # transpose + check("ij->j", A) # row sum + check("ij->i", A) # col sum + check("ij,ij->ij", A, A) # matrix element-wise mul + check("ij,j->i", A, x) # matrix vector multiplication + check("ij,kj->ik", A, B) # matmul + check("ij,ab->ijab", A, E) # matrix outer product + + # Tensor operations + check("aij,ajk->aik", C, D) # batch matmul + check("ijk,jk->i", C, A) # tensor matrix contraction + check("aij,jk->aik", D, E) # tensor matrix contraction + check("abcd,dfg->abcfg", F, G) # tensor tensor contraction + check("ijk,jk->ik", C, A) # tensor matrix contraction with double indices + check("ijk,jk->ij", C, A) # tensor matrix contraction with double indices + check("ijk,ik->j", C, B) # non contiguous + check("ijk,ik->jk", C, B) # non contiguous with double indices + + # Test diagonals + check("ii", H) # trace + check("ii->i", H) # diagonal + check('iji->j', I) # non-contiguous trace + + # Test ellipsis + check("i...->...", H) + check("ki,...k->i...", A.t(), B) + check("k...,jk", A.t(), B) + check('...ik, ...kj -> ...ij', torch.rand(2, 3, 4), torch.rand(1, 5)) + check('bik,k...j->i...j', torch.rand(5, 2, 3), torch.rand(3, 2)) + check('i...j, ij... 
-> ...ij', torch.rand(2, 3, 4), torch.rand(2, 4, 2, 3)) + + # torch.bilinear with discontiguous tensors + l = torch.randn(10, 5, device=device).transpose(0, 1) + r = torch.randn(20, 5, device=device).transpose(0, 1) + w = torch.randn(15, 10, 20, device=device) + check("bn,anm,bm->ba", l, w, r) + # with strided tensors + check("bn,anm,bm->ba", l[:, ::2], w[:, ::2, ::2], r[:, ::2]) + + def test_einsum_corner_cases(self, device): + def check(equation, *operands, expected_output): + tensors = [torch.tensor(operand, dtype=torch.float32, device=device) if not isinstance(operand, tuple) + else torch.rand(operand, dtype=torch.float32, device=device) for operand in operands] + output = torch.einsum(equation, tensors) + self.assertEqual(output, torch.tensor(expected_output, dtype=torch.float32, device=device)) + + # Test equation variantions + check(' ', 1, expected_output=1) + check(' -> ', 1, expected_output=1) + check(' , ', 2, 2, expected_output=4) + check(' , , ', 2, 2, 2, expected_output=8) + check(' , -> ', 2, 2, expected_output=4) + check(' i ', [1], expected_output=[1]) + check(' i -> ', [1], expected_output=1) + check(' i -> i ', [1], expected_output=[1]) + check(' i , i ', [2], [2], expected_output=4) + check(' i , i -> i ', [2], [2], expected_output=[4]) + + # Test tensors with 0 size dimensions + check('i', [], expected_output=[]) + check(' i j -> j', [[], []], expected_output=[]) + check('ij->i', [[], []], expected_output=[0., 0.]) + check(' i j k , k -> i j ', (3, 0, 6), (6,), expected_output=[[], [], []]) + + # Test broadcasting + check('i,j', [2], [1, 2], expected_output=[[2, 4]]) + check('i,ij->ij', [1, 2], [[1, 2, 3], [2, 3, 4]], expected_output=[[1, 2, 3], [4, 6, 8]]) + + # Test ellipsis broadcasting + check('...', 1, expected_output=1) + check('...->', 1, expected_output=1) + check('...->...', 1, expected_output=1) + check('i...->i', [1], expected_output=[1]) + check('i...->...i', [1], expected_output=[1]) + + def test_einsum_error_cases(self, device): + def check(equation, operands, regex, exception=RuntimeError): + with self.assertRaisesRegex(exception, r'einsum\(\) ' + regex): + torch.einsum(equation, operands) + + x = torch.rand(2) + y = torch.rand(2, 3) + + check('', [], r'must provide at least one operand') + check('. ..', [x], r'found \'.\' for operand 0 that is not part of any ellipsis') + check('... ...', [x], r'found \'.\' for operand 0 for which an ellipsis was already found') + check('A', [x], r'operand subscript must be in range \[a, z\] but found A for operand 0') + check(',', [x], r'fewer operands were provided than specified in the equation') + check('', [x, x], r'more operands were provided than specified in the equation') + check('', [x], r'the number of subscripts in the equation \(0\) does not match the number ' + r'of dimensions \(1\) for operand 0 and no ellipsis was given') + check('ai', [x], r'the number of subscripts in the equation \(2\) does not match the number ' + r'of dimensions \(1\) for operand 0 and no ellipsis was given') + check('ai...', [x], r'the number of subscripts in the equation \(2\) is more than the number ' + r'of dimensions \(1\) for operand 0') + check('a->... 
.', [x], r'found \'.\' for output but an ellipsis \(...\) was already found') + check('a->..', [x], r'found \'.\' for output that is not part of any ellipsis \(...\)') + check('a->A', [x], r'subscripts must be in range \[a, z\] but found A for the output') + check('a->aa', [x], r'output subscript a appears more than once in the output') + check('a->i', [x], r'output subscript i does not appear in the equation for any input operand') + check('...->', [x], r'ellipsis \(...\) covering one or more dimensions was given in the input ' + r'but not in the output') + check('aa', [y], r'subscript a is repeated for operand 0 but the sizes don\'t match, 3 != 2') + check('a, ba', [x, y], r'operands do not broadcast with remapped shapes \[original->remapped\]: ' + r'\[2\]->\[1, 2\] \[2, 3\]->\[2, 3\]') + instantiate_device_type_tests(TestLinalg, globals()) if __name__ == '__main__': diff --git a/test/test_overrides.py b/test/test_overrides.py index 4734b3bc7c91..f12d9ace9cbd 100644 --- a/test/test_overrides.py +++ b/test/test_overrides.py @@ -821,6 +821,22 @@ def test_gradcheck(self): torch.add, }) +class TestNamedTuple(TestCase): + "Regression test for gh-47090" + def test_max(self): + x = torch.tensor([1, 2]) + xs = x.as_subclass(SubTensor2) + r = torch.max(x, dim=0) + rs = torch.max(xs, dim=0) + self.assertEqual(type(r), type(rs)) + self.assertEqual(r, rs) + +class TestGradNewOnesOverride(TestCase): + """ Regression test for gh-47069 """ + def test_newones(self): + t = torch.tensor([1, 2]).as_subclass(SubTensor2) + n = t.new_ones((1, 2)) + self.assertEqual(type(n), SubTensor2) if __name__ == '__main__': unittest.main() diff --git a/test/test_torch.py b/test/test_torch.py index ba9492c500f4..fce680b2b7af 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -1,6 +1,5 @@ import sys import io -import gc import inspect import itertools import math @@ -40,7 +39,7 @@ onlyCUDA, onlyCPU, \ dtypes, dtypesIfCUDA, dtypesIfCPU, deviceCountAtLeast, skipCUDAIf, precisionOverride, \ PYTORCH_CUDA_MEMCHECK, largeTensorTest, onlyOnCPUAndCUDA, expectedAlertNondeterministic -from typing import Dict, List, Tuple, Union +from typing import Dict, List import torch.backends.quantized import torch.testing._internal.data from torch.testing._internal.common_cuda import tf32_on_and_off, tf32_is_not_fp32, with_tf32_off @@ -2951,20 +2950,34 @@ def test_parsing_intlist(self): lambda: torch.tensor().new_zeros((5, 5), 0)) def test_half_tensor(self): - x = torch.randn(5, 5).float() - y = torch.randn(5, 5).float() - xh, yh = x.half(), y.half() + devices = ["cpu"] + if torch.cuda.is_available(): + devices.append("cuda") - self.assertEqual(x.half().float(), x, atol=1e-3, rtol=0) + # contiguous tensor + # non-contiguous tensor + # dense non-overlapping tensor + # non-dense non-overlapping sliced tensor + # non-dense overlapping equal strides + for device in devices: + tset = ( + torch.randn(4, 3, 2, device=device, dtype=torch.float).contiguous(), + torch.randn(4, 3, 2, device=device, dtype=torch.float).transpose(0, 1), + torch.randn(4, 3, 2, device=device, dtype=torch.float), + torch.randn(4, 3, 2, device=device, dtype=torch.float)[:, :, ::2], + torch.empty_strided( + (4, 2, 3), (10, 3, 3), device=device, dtype=torch.float + ).copy_(torch.rand((4, 2, 3), dtype=torch.float, device=device)), + ) - z = torch.Tensor(5, 5) - self.assertEqual(z.copy_(xh), x, atol=1e-3, rtol=0) - - with tempfile.NamedTemporaryFile() as f: - torch.save(xh, f) - f.seek(0) - xh2 = torch.load(f) - self.assertEqual(xh.float(), xh2.float()) + for x in tset: + 
self.assertEqual(x.half().float(), x, atol=1e-3, rtol=0) + xh = x.half() + with tempfile.NamedTemporaryFile() as f: + torch.save(xh, f) + f.seek(0) + xh2 = torch.load(f) + self.assertEqual(xh.float(), xh2.float()) def test_from_buffer(self): a = bytearray([1, 2, 3, 4]) @@ -7991,7 +8004,6 @@ def test_cholesky(self, device, dtype): B = torch.mm(L, L.t().conj()) self.assertEqual(A, B, atol=1e-14, rtol=0, msg='cholesky (lower) did not allow rebuilding the original matrix') - @skipIfRocm # This test has many dimensions, which is larger than the maximum dims supported by ROCm (16) def test_view(self, device): tensor = torch.rand(15, device=device) template = torch.rand(3, 5, device=device) @@ -9311,6 +9323,11 @@ def test_kthvalue(self, device, dtype): self.assertEqual(res1val[:, :], res2val[:, :, k - 1], atol=0, rtol=0) self.assertEqual(res1ind[:, :], res2ind[:, :, k - 1], atol=0, rtol=0) + # Test scalar input (test case from https://github.com/pytorch/pytorch/issues/30818) + # Tests that passing a scalar tensor or 1D tensor with 1 element work either way + x = torch.tensor([2], device=device, dtype=dtype) + self.assertEqual(x.squeeze().kthvalue(1), x.kthvalue(1)) + @skipCUDAIfNoMagma @skipCPUIfNoLapack @unittest.skipIf(not TEST_NUMPY, "NumPy not found") @@ -9649,7 +9666,7 @@ def test_multidim(x, dim): expected = fn(y, 1, keepdim=False) self.assertEqual(x[:, 1], expected, msg='{} with out= kwarg'.format(fn_name)) - @onlyCUDA + @slowTest @largeTensorTest('10GB') def test_reduction_split(self, device): # Test reduction when there is a 32bit-indexing split @@ -9658,13 +9675,6 @@ def test_reduction_split(self, device): result = input_.sum(dim=0) expect = input_[0] + input_[1] + input_[2] + input_[3] + input_[4] self.assertEqual(result, expect) - gc.collect() - torch.cuda.empty_cache() - a = torch.randn(8, 1, 128, 1024, 1024, device=device, dtype=torch.half) - self.assertEqual((a.sum(1) - a.squeeze()).abs().max(), 0) - gc.collect() - torch.cuda.empty_cache() - self.assertEqual((a.sum(1, keepdim=True) - a).abs().max(), 0) @onlyCUDA @dtypes(torch.half, torch.float, torch.double) @@ -16280,81 +16290,6 @@ def test_helper(min, max): test_helper(torch.finfo(dtype).tiny, torch.finfo(dtype).max) - @onlyCPU - @slowTest - @unittest.skipIf(not TEST_NUMPY, 'Numpy not found') - @dtypes(torch.double) - def test_einsum(self, device: torch.device, dtype: torch.dtype) -> None: - # test cases taken from https://gist.github.com/rockt/15ee013889d65342088e9260a377dc8f - x = torch.randn(5, dtype=dtype, device=device) - y = torch.randn(7, dtype=dtype, device=device) - A = torch.randn(3, 5, dtype=dtype, device=device) - B = torch.randn(2, 5, dtype=dtype, device=device) - C = torch.randn(2, 3, 5, dtype=dtype, device=device) - D = torch.randn(2, 5, 7, dtype=dtype, device=device) - E = torch.randn(7, 9, dtype=dtype, device=device) - F = torch.randn(2, 3, 5, 7, dtype=dtype, device=device) - G = torch.randn(7, 11, 13, dtype=dtype, device=device) - H = torch.randn(4, 4, dtype=dtype, device=device) - I = torch.randn(3, 4, 4, dtype=dtype, device=device) - l = torch.randn(5, 10, dtype=dtype, device=device) - r = torch.randn(5, 20, dtype=dtype, device=device) - w = torch.randn(30, 10, 20, dtype=dtype, device=device) - test_list: List[Union[Tuple[str, torch.Tensor], - Tuple[str, torch.Tensor, torch.Tensor], - Tuple[str, torch.Tensor, torch.Tensor, torch.Tensor]]] = [ - # -- Vector - ("i->", x), # sum - ("i,i->", x, x), # dot - ("i,i->i", x, x), # vector element-wise mul - ("i,j->ij", x, y), # outer - # -- Matrix - ("ij->ji", A), # 
transpose - ("ij->j", A), # row sum - ("ij->i", A), # col sum - ("ij,ij->ij", A, A), # matrix element-wise mul - ("ij,j->i", A, x), # matrix vector multiplication - ("ij,kj->ik", A, B), # matmul - ("ij,ab->ijab", A, E), # matrix outer product - # -- Tensor - ("aij,ajk->aik", C, D), # batch matmul - ("ijk,jk->i", C, A), # tensor matrix contraction - ("aij,jk->aik", D, E), # tensor matrix contraction - ("abcd,dfg->abcfg", F, G), # tensor tensor contraction - ("ijk,jk->ik", C, A), # tensor matrix contraction with double indices - ("ijk,jk->ij", C, A), # tensor matrix contraction with double indices - ("ijk,ik->j", C, B), # non contiguous - ("ijk,ik->jk", C, B), # non contiguous with double indices - # -- Diagonal - ("ii", H), # trace - ("ii->i", H), # diagonal - # -- Ellipsis - ("i...->...", H), - ("ki,...k->i...", A.t(), B), - ("k...,jk", A.t(), B), - ("...ii->...i", I), # batch diagonal - # -- Other - ("bn,anm,bm->ba", l, w, r), # as torch.bilinear - ("... ii->...i ", I), # batch diagonal with spaces - ] - for test in test_list: - actual = torch.einsum(test[0], test[1:]) - expected = np.einsum(test[0], *[t.numpy() for t in test[1:]]) - self.assertEqual(expected.shape, actual.shape, msg=test[0]) - self.assertEqual(expected, actual, msg=test[0]) - # test vararg - actual2 = torch.einsum(test[0], *test[1:]) - self.assertEqual(expected.shape, actual2.shape, msg=test[0]) - self.assertEqual(expected, actual2, msg=test[0]) - - def do_einsum(*args): - return torch.einsum(test[0], args) - # FIXME: following test cases fail gradcheck - if test[0] not in {"i,i->", "i,i->i", "ij,ij->ij"}: - gradcheck_inps = tuple(t.detach().requires_grad_() for t in test[1:]) - self.assertTrue(torch.autograd.gradcheck(do_einsum, gradcheck_inps)) - self.assertTrue(A._version == 0) # check that we do not use inplace ops - @onlyCPU @dtypes(torch.bool, torch.double) def test_sum_all(self, device, dtype) -> None: @@ -17347,8 +17282,11 @@ def _test_copysign_numpy(a, b): # Use double copysign to verify the correctnes of 0.0 and -0.0, since # it always True for self.assertEqual(0.0 == -0.0). So, we use 1 as the # magnitude to verify the sign between torch and numpy results, elementwise. - self.assertEqual(torch.copysign(torch.tensor(1.0), torch_result), - torch.copysign(torch.tensor(1.0), expected)) + # Special case: NaN conversions between FP32 and FP16 is not bitwise + # equivalent to pass this assertion. 
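# [Editor's sketch, not part of the patch] Why the copysign check above compares
# `copysign(1.0, ...)` rather than the raw values: copysign transfers only the sign
# bit, so mapping every element onto magnitude 1.0 turns "do torch and NumPy agree on
# the sign (including -0.0)?" into an ordinary equality check. fp16 is excluded
# because an fp32<->fp16 round trip need not preserve a NaN's sign bit bit-for-bit.
import torch

res = torch.tensor([3.0, -0.0, 0.0, -2.5])
signs = torch.copysign(torch.tensor(1.0), res)   # -> [1., -1., 1., -1.]
assert torch.equal(signs, torch.tensor([1.0, -1.0, 1.0, -1.0]))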
+ if a.dtype != torch.float16 and b.dtype != torch.float16: + self.assertEqual(torch.copysign(torch.tensor(1.0), torch_result), + torch.copysign(torch.tensor(1.0), expected)) # Compare Result with NumPy # Type promotion @@ -19155,11 +19093,7 @@ def test_nansum_out_dtype(self, device): torch_fn = partial(torch.nansum, dtype=out_dtype) np_out_dtype = torch_to_numpy_dtype_dict[out_dtype] np_fn = partial(np.nansum, dtype=np_out_dtype) - if (inp_dtype, out_dtype) == (torch.uint8, torch.float16): - # 25504.0 vs 25536.0 - self.compare_with_numpy(torch_fn, np_fn, x, device=None, dtype=None, atol=0, rtol=0.002) - else: - self.compare_with_numpy(torch_fn, np_fn, x, device=None, dtype=None) + self.compare_with_numpy(torch_fn, np_fn, x, device=None, dtype=None) @dtypes(torch.int32, torch.int64) def test_large_linspace(self, device, dtype): @@ -19599,6 +19533,50 @@ def test_dstack(self, device, dtype): expected = np.dstack(np_input) self.assertEqual(actual, expected) + @unittest.skipIf(not TEST_NUMPY, "NumPy not found") + @dtypes(*(torch.testing.get_all_dtypes(include_half=True, include_bfloat16=False, + include_bool=True, include_complex=False))) + def test_all_any_vs_numpy(self, device, dtype): + def _test_all_any(x): + self.compare_with_numpy(torch.all, np.all, x) + self.compare_with_numpy(torch.any, np.any, x) + + def _test_all_any_with_dim(x, dim): + torch_fn = partial(torch.all, dim=dim) + np_fn = partial(np.all, axis=dim) + self.compare_with_numpy(torch_fn, np_fn, x, exact_dtype=False) + + torch_fn = partial(torch.any, dim=dim) + np_fn = partial(np.any, axis=dim) + self.compare_with_numpy(torch_fn, np_fn, x, exact_dtype=False) + + for ndim in range(5): + shape = self._rand_shape(ndim, 1, 5) + x = self._generate_input(shape, dtype, device, with_extremal=False) + _test_all_any(x) + + x = self._generate_input(shape, dtype, device, with_extremal=True) + _test_all_any(x) + + x = torch.zeros_like(x) + _test_all_any(x) + + x = torch.ones_like(x) + _test_all_any(x) + + for dim in range(ndim): + x = self._generate_input(shape, dtype, device, with_extremal=False) + _test_all_any_with_dim(x, dim) + + x = self._generate_input(shape, dtype, device, with_extremal=True) + _test_all_any_with_dim(x, dim) + + x = torch.zeros_like(x) + _test_all_any_with_dim(x, dim) + + x = torch.ones_like(x) + _test_all_any_with_dim(x, dim) + @onlyOnCPUAndCUDA def test_repeated_dim(self, device): ops = [torch.mean, torch.sum, torch.nansum, torch.std, torch.logsumexp, torch.std, torch.var, diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py index 9f3353376913..95287da20755 100644 --- a/test/test_unary_ufuncs.py +++ b/test/test_unary_ufuncs.py @@ -9,7 +9,7 @@ from torch.testing._internal.common_utils import \ (TestCase, run_tests, torch_to_numpy_dtype_dict, suppress_warnings, - TEST_NUMPY, make_tensor) + TEST_NUMPY, IS_MACOS, make_tensor) from torch.testing._internal.common_methods_invocations import \ (unary_ufuncs) from torch.testing._internal.common_device_type import \ @@ -480,6 +480,16 @@ def test_nan_to_num(self, device, dtype): torch.nan_to_num(x, out=out, nan=nan, posinf=posinf, neginf=neginf) self.assertEqual(result, out) + @unittest.skipIf(IS_MACOS, "Skip Reference: https://github.com/pytorch/pytorch/issues/47500") + @dtypes(torch.cfloat, torch.cdouble) + def test_sqrt_complex_edge_values(self, device, dtype): + # Test Reference: https://github.com/pytorch/pytorch/pull/47424 + x = torch.tensor(0. 
- 1.0000e+20j, dtype=dtype, device=device) + self.compare_with_numpy(torch.sqrt, np.sqrt, x) + + x = torch.tensor(-1.0000e+20 - 4988429.2000j, dtype=dtype, device=device) + self.compare_with_numpy(torch.sqrt, np.sqrt, x) + instantiate_device_type_tests(TestUnaryUfuncs, globals()) if __name__ == '__main__': diff --git a/tools/autograd/templates/python_variable_methods.cpp b/tools/autograd/templates/python_variable_methods.cpp index 4b441d6f3616..15fd600e441c 100644 --- a/tools/autograd/templates/python_variable_methods.cpp +++ b/tools/autograd/templates/python_variable_methods.cpp @@ -56,7 +56,7 @@ static PyObject * THPVariable__is_view(PyObject *self, PyObject* args) { HANDLE_TH_ERRORS if (check_has_torch_function(self)) { - return handle_torch_function(self, "_is_view"); + return handle_torch_function(self, "_is_view", args); } auto& self_ = reinterpret_cast(self)->cdata; if (self_.is_view()) { @@ -160,7 +160,7 @@ static PyObject * THPVariable_get_device(PyObject* self_, PyObject* args) { HANDLE_TH_ERRORS if (check_has_torch_function(self_)) { - return handle_torch_function(self_, "get_device"); + return handle_torch_function(self_, "get_device", args, nullptr); } auto& self = reinterpret_cast(self_)->cdata; return wrap(self.get_device()); @@ -171,7 +171,7 @@ static PyObject * THPVariable_has_names(PyObject* self_, PyObject* args) { HANDLE_TH_ERRORS if (check_has_torch_function(self_)) { - return handle_torch_function(self_, "has_names"); + return handle_torch_function(self_, "has_names", args); } auto& self = reinterpret_cast(self_)->cdata; return wrap(self.has_names()); @@ -183,7 +183,7 @@ static PyObject * THPVariable_data_ptr(PyObject* self_, PyObject* args) { HANDLE_TH_ERRORS if (check_has_torch_function(self_)) { - return handle_torch_function(self_, "data_ptr"); + return handle_torch_function(self_, "data_ptr", args); } auto& self = reinterpret_cast(self_)->cdata; return wrap(self.data_ptr()); @@ -207,7 +207,7 @@ static PyObject * THPVariable_dim(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS if (check_has_torch_function(self)) { - return handle_torch_function(self, "dim"); + return handle_torch_function(self, "dim", args); } auto& self_ = reinterpret_cast(self)->cdata; return THPUtils_packInt64(self_.dim()); @@ -219,7 +219,7 @@ static PyObject * THPVariable_numel(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS if (check_has_torch_function(self)) { - return handle_torch_function(self, "numel"); + return handle_torch_function(self, "numel", args); } auto& self_ = reinterpret_cast(self)->cdata; return THPUtils_packInt64(self_.numel()); @@ -333,7 +333,7 @@ static bool dispatch_to_Bool(const Tensor & self) { static PyObject * THPVariable_float_scalar(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS if (check_has_torch_function(self)) { - return handle_torch_function(self, "__float__"); + return handle_torch_function(self, "__float__", args); } jit::tracer::warn("Converting a tensor to a Python float", jit::tracer::WARN_PYTHON_DATAFLOW); auto& self_ = reinterpret_cast(self)->cdata; @@ -344,7 +344,7 @@ static PyObject * THPVariable_float_scalar(PyObject* self, PyObject* args) { static PyObject * THPVariable_complex_scalar(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS if (check_has_torch_function(self)) { - return handle_torch_function(self, "__complex__"); + return handle_torch_function(self, "__complex__", args); } jit::tracer::warn("Converting a tensor to a Python complex", jit::tracer::WARN_PYTHON_DATAFLOW); auto& self_ = reinterpret_cast(self)->cdata; @@ -355,7 +355,7 
@@ static PyObject * THPVariable_complex_scalar(PyObject* self, PyObject* args) { static PyObject * THPVariable_integral_scalar(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS if (check_has_torch_function(self)) { - return handle_torch_function(self, "__int__"); + return handle_torch_function(self, "__int__", args); } jit::tracer::warn("Converting a tensor to a Python integer", jit::tracer::WARN_PYTHON_DATAFLOW); auto& self_ = reinterpret_cast(self)->cdata; @@ -374,7 +374,7 @@ static PyObject * THPVariable_integral_scalar(PyObject* self, PyObject* args) { static PyObject * THPVariable_index_scalar(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS if (check_has_torch_function(self)) { - return handle_torch_function(self, "__index__"); + return handle_torch_function(self, "__index__", args); } jit::tracer::warn("Converting a tensor to a Python index", jit::tracer::WARN_PYTHON_DATAFLOW); auto& self_ = reinterpret_cast(self)->cdata; @@ -396,7 +396,7 @@ static Tensor dispatch_invert(const Tensor & self) { static PyObject * THPVariable_invert(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS if (check_has_torch_function(self)) { - return handle_torch_function(self, "__invert__"); + return handle_torch_function(self, "__invert__", args); } auto& self_ = reinterpret_cast(self)->cdata; if (!isIntegralType(self_.scalar_type(), /*includeBool=*/true)) { @@ -691,7 +691,7 @@ static PyObject * THPVariable_element_size(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS if (check_has_torch_function(self)) { - return handle_torch_function(self, "element_size"); + return handle_torch_function(self, "element_size", args); } auto& self_ = reinterpret_cast(self)->cdata; return THPUtils_packInt64(self_.element_size()); @@ -769,7 +769,7 @@ static PyObject * THPVariable_item(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS if (check_has_torch_function(self)) { - return handle_torch_function(self, "item"); + return handle_torch_function(self, "item", args); } jit::tracer::warn("Converting a tensor to a Python number", jit::tracer::WARN_PYTHON_DATAFLOW); auto& self_ = reinterpret_cast(self)->cdata; @@ -838,7 +838,7 @@ static PyObject * THPVariable_new(PyObject* self, PyObject* args, PyObject* kwar { HANDLE_TH_ERRORS if (check_has_torch_function(self)) { - return handle_torch_function(self, "new"); + return handle_torch_function(self, "new", args, kwargs); } auto& self_ = reinterpret_cast(self)->cdata; OptionalDeviceGuard device_guard(device_of(self_)); @@ -850,7 +850,7 @@ static PyObject * THPVariable_new_ones(PyObject* self, PyObject* args, PyObject* { HANDLE_TH_ERRORS if (check_has_torch_function(self)) { - return handle_torch_function(self, "new_ones"); + return handle_torch_function(self, "new_ones", args, kwargs); } auto& self_ = reinterpret_cast(self)->cdata; OptionalDeviceGuard device_guard(device_of(self_)); @@ -862,7 +862,7 @@ static PyObject * THPVariable_new_tensor(PyObject* self, PyObject* args, PyObjec { HANDLE_TH_ERRORS if (check_has_torch_function(self)) { - return handle_torch_function(self, "new_tensor"); + return handle_torch_function(self, "new_tensor", args, kwargs); } auto& self_ = reinterpret_cast(self)->cdata; OptionalDeviceGuard device_guard(device_of(self_)); @@ -941,7 +941,7 @@ static PyObject * THPVariable_tolist(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS if (check_has_torch_function(self)) { - return handle_torch_function(self, "tolist"); + return handle_torch_function(self, "tolist", args); } jit::tracer::warn("Converting a tensor to a Python list", 
jit::tracer::WARN_PYTHON_DATAFLOW); auto self_ = reinterpret_cast(self)->cdata; @@ -1010,7 +1010,7 @@ static PyObject * THPVariable_type(PyObject* self, PyObject* args, PyObject* kwa static PyObject * THPVariable_bool_scalar(PyObject* self, PyObject* args) { if (check_has_torch_function(self)) { HANDLE_TH_ERRORS - return handle_torch_function(self, "__bool__"); + return handle_torch_function(self, "__bool__", args); END_HANDLE_TH_ERRORS } jit::tracer::warn("Converting a tensor to a Python boolean", jit::tracer::WARN_PYTHON_DATAFLOW); diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index 2b8f4fc64959..0ed2dff543fe 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -10,6 +10,7 @@ import pathlib import functools import json +from dataclasses import dataclass from tools.codegen.code_template import CodeTemplate from tools.codegen.model import * @@ -102,13 +103,25 @@ def parse_native_yaml(path: str) -> List[NativeFunction]: def with_native_function(func: Callable[[NativeFunction], T]) -> Callable[[NativeFunction], T]: @functools.wraps(func) def wrapper(f: NativeFunction) -> T: - with context(f'in {f.loc}:\n {f.func}'): - with local.parametrize( - use_c10_dispatcher=f.use_c10_dispatcher, - ): - return func(f) + with native_function_manager(f): + return func(f) return wrapper +def method_with_native_function(func: Callable[[S, NativeFunction], T]) -> Callable[[S, NativeFunction], T]: + @functools.wraps(func) + def wrapper(slf: S, f: NativeFunction) -> T: + with native_function_manager(f): + return func(slf, f) + return wrapper + +@contextlib.contextmanager +def native_function_manager(f: NativeFunction) -> Iterator[None]: + with context(f'in {f.loc}:\n {f.func}'): + with local.parametrize( + use_c10_dispatcher=f.use_c10_dispatcher, + ): + yield + # These two functions purposely return generators in analogy to map() # so that you don't mix up when you need to list() them @@ -180,49 +193,53 @@ def cpp_string(s: str) -> str: # # This function is also used for a secondary purpose: the registration # logic is also reused to implement per-operator registration. -def compute_type_method( - dispatch: Optional[str], *, +@dataclass(frozen=True) +class ComputeTypeMethod: + dispatch: Optional[str] + # TODO: Give more precise type Union[Literal[Target.DEFINITION, # Target.REGISTRATION]]; requires Literal from typing_extensions # which we don't have a dep for yet. - target: Target, + target: Target + # Selector object to determine which operators to generate # registration code for. 
selector: SelectiveBuilder -) -> Callable[[NativeFunction], Optional[str]]: - if dispatch is None: - assert target is Target.REGISTRATION + def __post_init__(self) -> None: + assert self.target is not Target.DECLARATION + if self.dispatch is None: + assert self.target is Target.REGISTRATION - @with_native_function - def func(f: NativeFunction) -> Optional[str]: - # Has to be here as mypy won't transfer asserts into closures - assert target is not Target.DECLARATION + @method_with_native_function + def __call__(self, f: NativeFunction) -> Optional[str]: + # for mypy type refinement; would be fixed by TODO on target + assert self.target is not Target.DECLARATION - if dispatch is not None: - if dispatch not in f.dispatch: + if self.dispatch is not None: + if self.dispatch not in f.dispatch: return None op_name = f"aten::{f.func.name}" - if target is Target.REGISTRATION and not selector.is_operator_selected(op_name): + if self.target is Target.REGISTRATION and not self.selector.is_operator_selected(op_name): return None name = native.name(f.func) returns_type = native.returns_type(f.func.returns) args = native.arguments(f.func) args_str = ', '.join(map(str, args)) - dispatch_to_all_backends = dispatch is not None and dispatch in KEYWORD_ALL_BACKENDS + dispatch_to_all_backends = self.dispatch is not None and self.dispatch in KEYWORD_ALL_BACKENDS - if target is Target.DEFINITION: - assert dispatch is not None - impl_name = f"at::native::{f.dispatch[dispatch]}" + if self.target is Target.DEFINITION: + assert self.dispatch is not None + impl_name = f"at::native::{f.dispatch[self.dispatch]}" args_exprs_str = ', '.join(a.name for a in args) return_kw = " return " cuda_guard = "" - if dispatch_to_all_backends or 'CUDA' in dispatch: + if dispatch_to_all_backends or 'CUDA' in self.dispatch: self_args = (a for a in f.func.arguments if a.name == "self") # There is precedence for which argument we use to do @@ -249,7 +266,7 @@ def func(f: NativeFunction) -> Optional[str]: # works just as well. if f.device_guard and dispatch_to_all_backends and has_tensor_options: cuda_guard = cuda_guard_from_tensor_options - elif f.device_guard and dispatch is not None and 'CUDA' in dispatch and has_tensor_options: + elif f.device_guard and self.dispatch is not None and 'CUDA' in self.dispatch and has_tensor_options: cuda_guard = f"""\ globalContext().lazyInitCUDA(); {cuda_guard_from_tensor_options} @@ -269,8 +286,8 @@ def func(f: NativeFunction) -> Optional[str]: }} """ - elif target is Target.REGISTRATION: - if dispatch is None: + elif self.target is Target.REGISTRATION: + if self.dispatch is None: return f'm.def({cpp_string(str(f.func))});\n' elif f.manual_kernel_registration: return None @@ -278,7 +295,7 @@ def func(f: NativeFunction) -> Optional[str]: if dispatch_to_all_backends: type_name = f'TypeDefault::{name}' else: - type_name = f'{dispatch}Type::{name}' + type_name = f'{self.dispatch}Type::{name}' dispatcher_sig = DispatcherSignature.from_schema(f.func) @@ -302,21 +319,22 @@ def func(f: NativeFunction) -> Optional[str]: # in a TORCH_LIBRARY_FRAGMENT that does not have an ambient backend. So # the torch::dispatch specification here is important! See # Note [Redundancy in registration code is OK] for how we handle redundant info. 
- if dispatch is not None: - payload = f"torch::dispatch(DispatchKey::{dispatch},\n{payload})\n" + if self.dispatch is not None: + payload = f"torch::dispatch(DispatchKey::{self.dispatch},\n{payload})\n" return f'm.impl("{f.func.name}",\n{payload});\n' else: - assert_never(target) - - return func + assert_never(self.target) # Generates Function.cpp and Function.h. These files provide the # functional public C++ API, and the scaffolding to call into # the dispatcher from these functions. See also compute_tensor_method. -def compute_function(*, target: Target) -> Callable[[NativeFunction], Optional[str]]: - @with_native_function - def go(f: NativeFunction) -> Optional[str]: +@dataclass(frozen=True) +class ComputeFunction: + target: Target + + @method_with_native_function + def __call__(self, f: NativeFunction) -> Optional[str]: if f.manual_kernel_registration: return None if Variant.function not in f.variants: @@ -326,13 +344,13 @@ def go(f: NativeFunction) -> Optional[str]: sig_group = CppSignatureGroup.from_schema(f.func, method=False) - if target is Target.DECLARATION: + if self.target is Target.DECLARATION: result = f"CAFFE2_API {sig_group.signature.decl()};\n" if sig_group.faithful_signature is not None: result += f"CAFFE2_API {sig_group.faithful_signature.decl()};\n" return result - assert target is Target.DEFINITION + assert self.target is Target.DEFINITION def generate_defn(sig: CppSignature) -> str: dispatcher_sig = DispatcherSignature.from_schema(f.func) @@ -357,14 +375,15 @@ def generate_defn(sig: CppSignature) -> str: return result - return go - # Generates TensorBody.h (sic) and TensorMethods.cpp. These files provide the # object-oriented (method-based) public C++ API, and the scaffolding to call into # the dispatcher from these functions. See also compute_function. -def compute_tensor_method(*, target: Target) -> Callable[[NativeFunction], Optional[str]]: - @with_native_function - def go(f: NativeFunction) -> Optional[str]: +@dataclass(frozen=True) +class ComputeTensorMethod: + target: Target + + @method_with_native_function + def __call__(self, f: NativeFunction) -> Optional[str]: if Variant.method not in f.variants: return None @@ -376,13 +395,13 @@ def go(f: NativeFunction) -> Optional[str]: sig_group = CppSignatureGroup.from_schema(f.func, method=True) - if target is Target.DECLARATION: + if self.target is Target.DECLARATION: result = f"{sig_group.signature.decl()} const;\n" if sig_group.faithful_signature is not None: result += f"{sig_group.faithful_signature.decl()} const;\n" return result - assert target is Target.DEFINITION + assert self.target is Target.DEFINITION def generate_defn(sig: CppSignature) -> str: dispatcher_sig = DispatcherSignature.from_schema(f.func) @@ -406,8 +425,6 @@ def generate_defn(sig: CppSignature) -> str: return result - return go - # Generates ATenOpList.cpp, a runtime accessible list of all aten # operators. # TODO: This was historically used to help some JIT interop code @@ -442,9 +459,12 @@ def compute_native_function_declaration(f: NativeFunction) -> List[str]: # Generates BackendSelectRegister.cpp, a series of kernels which provide # specialized computation of dispatch key for operator signatures which cannot # be easily done automatically using templating. 
-def compute_backend_select(*, target: Target) -> Callable[[NativeFunction], Optional[str]]: - @with_native_function - def go(f: NativeFunction) -> Optional[str]: +@dataclass(frozen=True) +class ComputeBackendSelect: + target: Target + + @method_with_native_function + def __call__(self, f: NativeFunction) -> Optional[str]: if str(f.func.name.name).endswith('_like') or str(f.func.name.name).startswith('new_'): return None @@ -471,7 +491,7 @@ def go(f: NativeFunction) -> Optional[str]: dispatcher_exprs = native_sig.dispatcher_exprs() dispatch_key = "options.computeDispatchKey()" - if target is Target.DEFINITION: + if self.target is Target.DEFINITION: # I don't think there's actually a good reason to generate # these two cases differently # The first case could probably be improved though- it calls dispatchTypeId(), @@ -494,7 +514,7 @@ def go(f: NativeFunction) -> Optional[str]: return op.callWithDispatchKey(_dk, {', '.join(a.expr for a in dispatcher_exprs)}); }} """ - elif target is Target.REGISTRATION: + elif self.target is Target.REGISTRATION: if local.use_c10_dispatcher() is UseC10Dispatcher.full: return f"""m.impl("aten::{f.func.name}", TORCH_FN({name}));""" elif local.use_c10_dispatcher() is UseC10Dispatcher.hacky_wrapper_for_legacy_signatures: @@ -504,11 +524,10 @@ def go(f: NativeFunction) -> Optional[str]: else: assert local.use_c10_dispatcher() is UseC10Dispatcher.with_codegenerated_unboxing_wrapper return f"""m.impl_UNBOXED("aten::{f.func.name}", {name});""" - elif target is Target.DECLARATION: + elif self.target is Target.DECLARATION: raise AssertionError() else: - assert_never(target) - return go + assert_never(self.target) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # @@ -993,12 +1012,11 @@ def make_file_manager(install_dir: str) -> FileManager: '', 'Backend': dispatch, 'type_derived_method_definitions': list(mapMaybe( - compute_type_method(dispatch, target=Target.DEFINITION, selector=selector), + ComputeTypeMethod(dispatch, Target.DEFINITION, selector), native_functions )), 'function_registrations': list(mapMaybe( - compute_type_method( - dispatch, target=Target.REGISTRATION, selector=selector), + ComputeTypeMethod(dispatch, Target.REGISTRATION, selector), native_functions )), }) @@ -1012,35 +1030,35 @@ def make_file_manager(install_dir: str) -> FileManager: cpu_fm.write('TypeDefault.cpp', lambda: { 'type_method_definitions': list(mapMaybe( - compute_type_method('Math', target=Target.DEFINITION, selector=selector), + ComputeTypeMethod('Math', Target.DEFINITION, selector), native_functions)) + list(mapMaybe( - compute_type_method('DefaultBackend', target=Target.DEFINITION, selector=selector), + ComputeTypeMethod('DefaultBackend', Target.DEFINITION, selector), native_functions)), 'function_registrations': list(mapMaybe( - compute_type_method(None, target=Target.REGISTRATION, selector=schema_selector), + ComputeTypeMethod(None, Target.REGISTRATION, schema_selector), native_functions)), 'math_function_registrations': list(mapMaybe( - compute_type_method('Math', target=Target.REGISTRATION, selector=selector), + ComputeTypeMethod('Math', Target.REGISTRATION, selector), native_functions)), 'default_backend_function_registrations': list(mapMaybe( - compute_type_method('DefaultBackend', target=Target.REGISTRATION, selector=selector), + ComputeTypeMethod('DefaultBackend', Target.REGISTRATION, selector), native_functions)), }) cpu_fm.write('Functions.h', lambda: { - 'function_declarations': list(mapMaybe(compute_function(target=Target.DECLARATION), 
native_functions)), + 'function_declarations': list(mapMaybe(ComputeFunction(Target.DECLARATION), native_functions)), }) cpu_fm.write('Functions.cpp', lambda: { - 'function_definitions': list(mapMaybe(compute_function(target=Target.DEFINITION), native_functions)), + 'function_definitions': list(mapMaybe(ComputeFunction(Target.DEFINITION), native_functions)), }) core_fm.write('TensorBody.h', lambda: { - 'tensor_method_declarations': list(mapMaybe(compute_tensor_method(target=Target.DECLARATION), native_functions)), + 'tensor_method_declarations': list(mapMaybe(ComputeTensorMethod(Target.DECLARATION), native_functions)), }) core_fm.write('TensorMethods.cpp', lambda: { - 'tensor_method_definitions': list(mapMaybe(compute_tensor_method(target=Target.DEFINITION), native_functions)), + 'tensor_method_definitions': list(mapMaybe(ComputeTensorMethod(Target.DEFINITION), native_functions)), }) core_fm.write('ATenOpList.cpp', lambda: { 'aten_ops': list(mapMaybe(compute_aten_op, native_functions)), @@ -1050,9 +1068,9 @@ def make_file_manager(install_dir: str) -> FileManager: }) cpu_fm.write('BackendSelectRegister.cpp', lambda: { 'backend_select_method_definitions': - list(mapMaybe(compute_backend_select(target=Target.DEFINITION), native_functions)), + list(mapMaybe(ComputeBackendSelect(Target.DEFINITION), native_functions)), 'backend_select_function_registrations': - list(mapMaybe(compute_backend_select(target=Target.REGISTRATION), native_functions)), + list(mapMaybe(ComputeBackendSelect(Target.REGISTRATION), native_functions)), }) cpu_fm.write('Declarations.yaml', lambda: format_yaml([compute_declaration_yaml(f) for f in native_functions])) diff --git a/tools/codegen/model.py b/tools/codegen/model.py index 9d8e2e73693b..3d084d5edb32 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -391,28 +391,6 @@ def __post_init__(self) -> None: '_foreach_div_.List', '_foreach_exp_', '_foreach_sqrt_', - '_foreach_abs_', - '_foreach_acos_', - '_foreach_asin_', - '_foreach_atan_', - '_foreach_ceil_', - '_foreach_cos_', - '_foreach_cosh_', - '_foreach_erf_', - '_foreach_erfc_', - '_foreach_expm1_', - '_foreach_floor_', - '_foreach_log_', - '_foreach_log10_', - '_foreach_log1p_', - '_foreach_log2_', - '_foreach_neg_', - '_foreach_tan_', - '_foreach_tanh_', - '_foreach_sin_', - '_foreach_sinh_', - '_foreach_round_', - '_foreach_lgamma_', '_foreach_addcmul_.Scalar', '_foreach_addcdiv_.Scalar', '_foreach_addcmul_.ScalarList', diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index eba7368cb03e..ba7d44814421 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -151,7 +151,7 @@ def self_cpu_time_total(self): def cpu_children_populated(self): return self._cpu_children_populated - def table(self, sort_by=None, row_limit=100, header=None, top_level_events_only=False): + def table(self, sort_by=None, row_limit=100, max_src_column_width=75, header=None, top_level_events_only=False): """Prints an EventList as a nicely formatted table. 
Arguments: @@ -173,6 +173,7 @@ def table(self, sort_by=None, row_limit=100, header=None, top_level_events_only= self, sort_by=sort_by, row_limit=row_limit, + max_src_column_width=max_src_column_width, header=header, use_cuda=self._use_cuda, profile_memory=self._profile_memory, @@ -420,11 +421,11 @@ def _check_finish(self): raise RuntimeError("can't export a trace that didn't finish running") self.function_events.populate_cpu_children() - def table(self, sort_by=None, row_limit=100, header=None, top_level_events_only=False): + def table(self, sort_by=None, row_limit=100, max_src_column_width=75, header=None, top_level_events_only=False): self._check_finish() assert self.function_events is not None return self.function_events.table( - sort_by=sort_by, row_limit=row_limit, header=header, + sort_by=sort_by, row_limit=row_limit, max_src_column_width=max_src_column_width, header=header, top_level_events_only=top_level_events_only ) table.__doc__ = EventList.table.__doc__ @@ -1165,6 +1166,7 @@ def build_table( sort_by=None, header=None, row_limit=100, + max_src_column_width=75, use_cuda=True, profile_memory=False, top_level_events_only=False): @@ -1195,7 +1197,7 @@ def build_table( has_stack = len(stacks) > 0 if has_stack: src_column_width = max([max([len(entry) for entry in stack]) for stack in stacks]) + 4 - src_column_width = min(src_column_width, 75) + src_column_width = min(src_column_width, max_src_column_width) headers = [ 'Name', diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index d9ddf35ee1df..e9d8f618eb21 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -1,5 +1,6 @@ #include +#include #include #ifndef _WIN32 #include @@ -59,6 +60,8 @@ constexpr auto kDeprecationWarning = "{} API is being deprecated, please ping " "https://github.com/pytorch/pytorch/issues/46291 " "if you see this warning"; +template +using intrusive_ptr_class_ = py::class_>; // PythonStore is a pybind11 trampoline class to allow a Python // class to inherit from c10d.Store and implement its interface. @@ -339,7 +342,7 @@ They are used in specifying strategies for reduction collectives, e.g., .def_readwrite("timeout", &::c10d::AllToAllOptions::timeout); auto store = - py::class_<::c10d::Store, std::shared_ptr<::c10d::Store>, PythonStore>( + py::class_<::c10d::Store, c10::intrusive_ptr<::c10d::Store>, PythonStore>( module, "Store", R"( @@ -543,7 +546,7 @@ Example:: >>> store.wait(["bad_key"], timedelta(seconds=10)) )"); - shared_ptr_class_<::c10d::FileStore>( + intrusive_ptr_class_<::c10d::FileStore>( module, "FileStore", store, @@ -566,7 +569,7 @@ Example:: .def(py::init()); #ifndef _WIN32 - shared_ptr_class_<::c10d::HashStore>( + intrusive_ptr_class_<::c10d::HashStore>( module, "HashStore", store, @@ -583,7 +586,7 @@ Example:: )") .def(py::init<>()); - shared_ptr_class_<::c10d::TCPStore>( + intrusive_ptr_class_<::c10d::TCPStore>( module, "TCPStore", store, @@ -623,7 +626,7 @@ Example:: std::chrono::milliseconds(::c10d::Store::kDefaultTimeout)); #endif - shared_ptr_class_<::c10d::PrefixStore>( + intrusive_ptr_class_<::c10d::PrefixStore>( module, "PrefixStore", store, @@ -636,7 +639,7 @@ that adds a prefix to each key inserted to the store. prefix (str): The prefix string that is prepended to each key before being inserted into the store. store (torch.distributed.store): A store object that forms the underlying key-value store. 
)") - .def(py::init>()); + .def(py::init>()); auto processGroup = shared_ptr_class_<::c10d::ProcessGroup>(module, "ProcessGroup") @@ -949,13 +952,13 @@ that adds a prefix to each key inserted to the store. processGroupGloo .def( py::init< - const std::shared_ptr<::c10d::Store>&, + const c10::intrusive_ptr<::c10d::Store>&, int, int, ::c10d::ProcessGroupGloo::Options>(), py::call_guard()) .def( - py::init([](const std::shared_ptr<::c10d::Store>& store, + py::init([](const c10::intrusive_ptr<::c10d::Store>& store, int rank, int size, std::chrono::milliseconds timeout) { @@ -994,13 +997,13 @@ that adds a prefix to each key inserted to the store. module, "ProcessGroupNCCL", processGroup) .def( py::init< - const std::shared_ptr<::c10d::Store>&, + const c10::intrusive_ptr<::c10d::Store>&, int, int, ::c10d::ProcessGroupNCCL::Options>(), py::call_guard()) .def( - py::init([](const std::shared_ptr<::c10d::Store>& store, + py::init([](const c10::intrusive_ptr<::c10d::Store>& store, int rank, int size, const std::chrono::milliseconds& timeout) { @@ -1045,7 +1048,7 @@ that adds a prefix to each key inserted to the store. py::call_guard()); #endif - shared_ptr_class_<::c10d::ProcessGroup::Work>(module, "Work") + intrusive_ptr_class_<::c10d::ProcessGroup::Work>(module, "Work") .def("is_completed", &::c10d::ProcessGroup::Work::isCompleted) .def( "is_success", @@ -1165,7 +1168,7 @@ that adds a prefix to each key inserted to the store. // Python side of the world. Calling Python functions on a Python object // completely bypasses pybind11. We need to test that the overloaded // functions call into Python and behave like we expect. - [](std::shared_ptr<::c10d::Store> store) { + [](c10::intrusive_ptr<::c10d::Store> store) { auto add = [&store](const std::string& key, int64_t value) { store->add(key, value); }; diff --git a/torch/csrc/distributed/rpc/init.cpp b/torch/csrc/distributed/rpc/init.cpp index 1d82a619ed7e..81af4abebd5f 100644 --- a/torch/csrc/distributed/rpc/init.cpp +++ b/torch/csrc/distributed/rpc/init.cpp @@ -576,7 +576,7 @@ PyObject* rpc_init(PyObject* _unused, PyObject* noargs) { shared_ptr_class_(module, "TensorPipeAgent", rpcAgent) .def( - py::init([](const std::shared_ptr<::c10d::Store>& store, + py::init([](const c10::intrusive_ptr<::c10d::Store>& store, std::string selfName, worker_id_t selfId, int worldSize, diff --git a/torch/csrc/distributed/rpc/process_group_agent.cpp b/torch/csrc/distributed/rpc/process_group_agent.cpp index 2f29adc8f0c4..13e685b8fe74 100644 --- a/torch/csrc/distributed/rpc/process_group_agent.cpp +++ b/torch/csrc/distributed/rpc/process_group_agent.cpp @@ -398,7 +398,7 @@ void ProcessGroupAgent::handleSend(const SendWork& work) { // ProcessGroup is not thread-safe when sending with the same tag, // hence the lock - std::vector> pendingSends; + std::vector> pendingSends; const auto dst = work.to_.id_; // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) diff --git a/torch/csrc/distributed/rpc/process_group_agent.h b/torch/csrc/distributed/rpc/process_group_agent.h index 1bc8db9ebf20..70fb1b40244d 100644 --- a/torch/csrc/distributed/rpc/process_group_agent.h +++ b/torch/csrc/distributed/rpc/process_group_agent.h @@ -230,14 +230,14 @@ class TORCH_API ProcessGroupAgent : public RpcAgent { // Lock and shared ptr to currently pending work, set in listenloop() and // interruptible in shutdown(). std::mutex recvWorkMutex_; - std::shared_ptr recvWork_; + c10::intrusive_ptr recvWork_; // Map of dst rank to current oustanding sends that we are waiting on. 
In the // case of a call to ::shutdown() while we are still waiting on these sends, // the pending sends contained in this map will be aborted, allowing the // waiting thread to be unblocked. std::unordered_map< worker_id_t, - std::set>> + std::set>> currentPendingSends_; // Lock to serialize access to the above map. std::mutex pendingSendMutex_; diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp index 6bf65f4c2628..eff1e7ebdf21 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp @@ -220,7 +220,7 @@ void TensorPipeAgent::collectNames() { } TensorPipeAgent::TensorPipeAgent( - const std::shared_ptr<::c10d::Store>& store, + const c10::intrusive_ptr<::c10d::Store>& store, std::string selfName, worker_id_t selfId, int worldSize, diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.h b/torch/csrc/distributed/rpc/tensorpipe_agent.h index b4a500de65be..b8c9a8c64e5c 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_agent.h +++ b/torch/csrc/distributed/rpc/tensorpipe_agent.h @@ -141,7 +141,7 @@ struct AggregatedNetworkData { class TensorPipeAgent : public RpcAgent { public: TensorPipeAgent( - const std::shared_ptr<::c10d::Store>& store, + const c10::intrusive_ptr<::c10d::Store>& store, std::string selfName, worker_id_t selfId, int worldSize, diff --git a/torch/csrc/jit/passes/freeze_module.cpp b/torch/csrc/jit/passes/freeze_module.cpp index 76b6f1d234ba..f4060a6c0e74 100644 --- a/torch/csrc/jit/passes/freeze_module.cpp +++ b/torch/csrc/jit/passes/freeze_module.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -75,6 +76,7 @@ class AttributePropagator { void run() { auto applyInline = [](std::shared_ptr& subgraph) { Inline(*subgraph); + ClearProfilingInformation(subgraph); }; auto applyOptimizations = [](std::shared_ptr& subgraph) { runOptimization(subgraph, /* unroll? 
*/ false); diff --git a/torch/csrc/jit/python/pybind_utils.h b/torch/csrc/jit/python/pybind_utils.h index 15c1cdd272b2..7630ad320adf 100644 --- a/torch/csrc/jit/python/pybind_utils.h +++ b/torch/csrc/jit/python/pybind_utils.h @@ -796,8 +796,7 @@ inline IValue toIValue( return c10::ivalue::ConcretePyObjectHolder::create(obj); } case TypeKind::CapsuleType: { - return IValue::make_capsule( - py::cast>(obj)); + return IValue::make_capsule(py::cast(obj).obj_ptr); } case TypeKind::FutureType: { return obj.cast>()->fut; @@ -1002,7 +1001,7 @@ inline py::object toPyObject(IValue ivalue) { // PyObject return py::reinterpret_borrow(ivalue.toPyObject()); } else if (ivalue.isCapsule()) { - return py::cast(ivalue.toCapsule()); + return py::cast(c10::Capsule(ivalue.toCapsule())); } else if (ivalue.isFuture()) { return py::cast(std::make_shared(ivalue.toFuture())); } else if (ivalue.isEnum()) { diff --git a/torch/csrc/jit/python/script_init.cpp b/torch/csrc/jit/python/script_init.cpp index a99f7469ac65..7c571384e481 100644 --- a/torch/csrc/jit/python/script_init.cpp +++ b/torch/csrc/jit/python/script_init.cpp @@ -714,7 +714,7 @@ void initJitScriptBindings(PyObject* module) { auto m = py::handle(module).cast(); // NOLINTNEXTLINE(bugprone-unused-raii) - py::class_>(m, "Capsule"); + py::class_(m, "Capsule"); auto object_class = py::class_(m, "ScriptObject") diff --git a/torch/csrc/utils/pybind.h b/torch/csrc/utils/pybind.h index 7df518f404c5..1447508535e5 100644 --- a/torch/csrc/utils/pybind.h +++ b/torch/csrc/utils/pybind.h @@ -17,6 +17,11 @@ namespace py = pybind11; +// This makes intrusive_ptr to be available as a custom pybind11 holder type, +// see +// https://pybind11.readthedocs.io/en/stable/advanced/smart_ptrs.html#custom-smart-pointers +PYBIND11_DECLARE_HOLDER_TYPE(T, c10::intrusive_ptr, true); + namespace pybind11 { namespace detail { // torch.autograd.Variable <-> at::Tensor conversions (without unwrapping) diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp index ff94b1f5ceca..950e7d9fb82d 100644 --- a/torch/csrc/utils/python_arg_parser.cpp +++ b/torch/csrc/utils/python_arg_parser.cpp @@ -139,7 +139,7 @@ FunctionParameter::FunctionParameter(const std::string& fmt, bool keyword_only) auto handle_torch_function_getter(THPVariable* self, const std::string& property_name) -> PyObject* { py::object torch_api = PyObject_FastGetAttrString(THPVariableClass, (char*)property_name.c_str()); std::string module_name = "torch.Tensor." 
+ property_name; - return handle_torch_function((PyObject *)self, "__get__", nullptr, torch_api.ptr(), module_name); + return handle_torch_function((PyObject *)self, "__get__", nullptr, nullptr, torch_api.ptr(), module_name); } auto handle_torch_function_setter(THPVariable* self, const std::string& property_name, PyObject* value) -> int { @@ -148,10 +148,10 @@ auto handle_torch_function_setter(THPVariable* self, const std::string& property if (value != nullptr) { py::tuple args_ = py::make_tuple(py::handle(value)); - handle_torch_function((PyObject *)self, "__set__", args_.ptr(), torch_api.ptr(), module_name); + handle_torch_function((PyObject *)self, "__set__", args_.ptr(), nullptr, torch_api.ptr(), module_name); } else { - handle_torch_function((PyObject *)self, "__delete__", nullptr, torch_api.ptr(), module_name); + handle_torch_function((PyObject *)self, "__delete__", nullptr, nullptr, torch_api.ptr(), module_name); } return 0; } @@ -175,13 +175,13 @@ auto combine_self_args(PyObject *self, PyObject *args) -> py::tuple { return args_; } -auto handle_torch_function(PyObject* self, const std::string& func_name, PyObject* args, PyObject* torch_api, const std::string& module_name) -> PyObject* { +auto handle_torch_function(PyObject* self, const std::string& func_name, PyObject* args, PyObject* kwargs, PyObject* torch_api, const std::string& module_name) -> PyObject* { py::object torch_api_function = PyObject_FastGetAttrString(torch_api, (char*)func_name.c_str()); TORCH_INTERNAL_ASSERT(torch_api_function.ptr() != nullptr, "torch API function must exist"); py::tuple args_ = combine_self_args(self, args); py::tuple py_types = py::make_tuple(py::handle(PyObject_Type(self))); py::object torch_function = PyObject_FastGetAttrString(self, "__torch_function__"); - py::object ret = py::reinterpret_steal(PyObject_CallFunctionObjArgs(torch_function.ptr(), torch_api_function.ptr(), py_types.ptr(), args_.ptr(), NULL)); + py::object ret = py::reinterpret_steal(PyObject_CallFunctionObjArgs(torch_function.ptr(), torch_api_function.ptr(), py_types.ptr(), args_.ptr(), kwargs)); if (ret.ptr() == nullptr) { // if an exception occurred in a user's implementation of // __torch_function__, throw it diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index 773486f30ee1..b0b81a9517da 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -820,8 +820,8 @@ auto handle_torch_function(PythonArgs &r, PyObject* self, PyObject* args, PyObje // Used for functions which needs to parse python args. auto handle_torch_function(PythonArgs &r, PyObject* args, PyObject* kwargs, PyObject* torch_api, const char* module_name) -> PyObject*; -// Used for functions that accept no keyword arguments and have no argument parsing -auto handle_torch_function(PyObject* self, const std::string& func_name, PyObject* args=nullptr, PyObject* torch_api=THPVariableClass, const std::string& module_name="torch.Tensor") -> PyObject*; +// Used for functions that have no argument parsing. +auto handle_torch_function(PyObject* self, const std::string& func_name, PyObject* args=nullptr, PyObject* kwargs=nullptr, PyObject* torch_api=THPVariableClass, const std::string& module_name="torch.Tensor") -> PyObject*; // Used for functions created in C++, e.g., C++ custom op, which doesn't use PythonArgParser to get overloaded_args. 
auto handle_torch_function_no_python_arg_parser(const std::vector &overloaded_args, PyObject* args, PyObject* kwargs, const char* func_name, PyObject* torch_api_function, const char* module_name) -> PyObject*; diff --git a/torch/functional.py b/torch/functional.py index 3781b73a178e..e26b4c1b4125 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -262,76 +262,102 @@ def lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True): def einsum(equation, *operands): r"""einsum(equation, *operands) -> Tensor -This function provides a way of computing multilinear expressions (i.e. sums of products) using the -Einstein summation convention. - -Args: - equation (string): The equation is given in terms of lower case letters (indices) to be associated - with each dimension of the operands and result. The left hand side lists the operands - dimensions, separated by commas. There should be one index letter per tensor dimension. - The right hand side follows after `->` and gives the indices for the output. - If the `->` and right hand side are omitted, it implicitly defined as the alphabetically - sorted list of all indices appearing exactly once in the left hand side. - The indices not apprearing in the output are summed over after multiplying the operands - entries. - If an index appears several times for the same operand, a diagonal is taken. - Ellipses `...` represent a fixed number of dimensions. If the right hand side is inferred, - the ellipsis dimensions are at the beginning of the output. - operands (Tensor): The operands to compute the Einstein sum of. - -.. note:: - - This function does not optimize the given expression, so a different formula for the same computation may - run faster or consume less memory. Projects like opt_einsum (https://optimized-einsum.readthedocs.io/en/stable/) - can optimize the formula for you. - -Examples:: - - >>> x = torch.randn(5) - >>> y = torch.randn(4) - >>> torch.einsum('i,j->ij', x, y) # outer product - tensor([[-0.0570, -0.0286, -0.0231, 0.0197], - [ 1.2616, 0.6335, 0.5113, -0.4351], - [ 1.4452, 0.7257, 0.5857, -0.4984], - [-0.4647, -0.2333, -0.1883, 0.1603], - [-1.1130, -0.5588, -0.4510, 0.3838]]) - - - >>> A = torch.randn(3,5,4) - >>> l = torch.randn(2,5) - >>> r = torch.randn(2,4) - >>> torch.einsum('bn,anm,bm->ba', l, A, r) # compare torch.nn.functional.bilinear - tensor([[-0.3430, -5.2405, 0.4494], - [ 0.3311, 5.5201, -3.0356]]) - - - >>> As = torch.randn(3,2,5) - >>> Bs = torch.randn(3,5,4) - >>> torch.einsum('bij,bjk->bik', As, Bs) # batch matrix multiplication - tensor([[[-1.0564, -1.5904, 3.2023, 3.1271], - [-1.6706, -0.8097, -0.8025, -2.1183]], - - [[ 4.2239, 0.3107, -0.5756, -0.2354], - [-1.4558, -0.3460, 1.5087, -0.8530]], - - [[ 2.8153, 1.8787, -4.3839, -1.2112], - [ 0.3728, -2.1131, 0.0921, 0.8305]]]) - - >>> A = torch.randn(3, 3) - >>> torch.einsum('ii->i', A) # diagonal - tensor([-0.7825, 0.8291, -0.1936]) - - >>> A = torch.randn(4, 3, 3) - >>> torch.einsum('...ii->...i', A) # batch diagonal - tensor([[-1.0864, 0.7292, 0.0569], - [-0.9725, -1.0270, 0.6493], - [ 0.5832, -1.1716, -1.5084], - [ 0.4041, -1.1690, 0.8570]]) - - >>> A = torch.randn(2, 3, 4, 5) - >>> torch.einsum('...ij->...ji', A).shape # batch permute - torch.Size([2, 3, 5, 4]) -""" + Sums the product of the elements of the input :attr:`operands` along dimensions specified using a notation + based on the Einstein summation convention. 
+ + Einsum allows computing many common multi-dimensional linear algebraic array operations by representing them + in a short-hand format based on the Einstein summation convention, given by :attr:`equation`. The details of + this format are described below, but the general idea is to label every dimension of the input :attr:`operands` + with some subscript and define which subscripts are part of the output. The output is then computed by summing + the product of the elements of the :attr:`operands` along the dimensions whose subscripts are not part of the + output. For example, matrix multiplication can be computed using einsum as `torch.einsum("ij,jk->ik", A, B)`. + Here, j is the summation subscript and i and k the output subscripts (see section below for more details on why). + + Equation: + + The :attr:`equation` string specifies the subscripts (lower case letters `['a', 'z']`) for each dimension of + the input :attr:`operands` in the same order as the dimensions, separating subcripts for each operand by a + comma (','), e.g. `'ij,jk'` specify subscripts for two 2D operands. The dimensions labeled with the same subscript + must be broadcastable, that is, their size must either match or be `1`. The exception is if a subscript is + repeated for the same input operand, in which case the dimensions labeled with this subscript for this operand + must match in size and the operand will be replaced by its diagonal along these dimensions. The subscripts that + appear exactly once in the :attr:`equation` will be part of the output, sorted in increasing alphabetical order. + The output is computed by multiplying the input :attr:`operands` element-wise, with their dimensions aligned based + on the subscripts, and then summing out the dimensions whose subscripts are not part of the output. + + Optionally, the output subscripts can be explictly defined by adding an arrow ('->') at the end of the equation + followed by the subscripts for the output. For instance, the following equation computes the transpose of a + matrix multiplication: 'ij,jk->ki'. The output subscripts must appear at least once for some input operand and + at most once for the output. + + Ellipsis ('...') can be used in place of subscripts to broadcast the dimensions covered by the ellipsis. + Each input operand may contain at most one ellipsis which will cover the dimensions not covered by subscripts, + e.g. for an input operand with 5 dimensions, the ellipsis in the equation `'ab...c'` cover the third and fourth + dimensions. The ellipsis does not need to cover the same number of dimensions across the :attr:`operands` but the + 'shape' of the ellipsis (the size of the dimensions covered by them) must be broadcastable. In implicit mode, + the ellipsis will come first in the output. In explicit mode, if an ellipses covers at least one dimension then + it must appear in the output since the dimensions under the ellipsis cannot be summed over. e.g. the following + equation implements batch matrix multiplication `'...ij,...jk->...ik'`. + + A few final notes: the equation may contain whitespaces between the different elements (subscripts, ellipsis, + arrow and comma) but something like `'. . .'` is not valid. An empty string `''` is valid for scalar operands. + + .. note:: + + This function does not optimize the given expression, so a different formula for the same computation may + run faster or consume less memory. Projects like opt_einsum (https://optimized-einsum.readthedocs.io/en/stable/) + can optimize the formula for you. 
+ + Args: + equation (string): The subscripts for the Einstein summation. + operands (Tensor): The operands to compute the Einstein sum of. + + Examples:: + + # trace + >>> torch.einsum('ii', torch.randn(4, 4)) + tensor(-1.2104) + + # diagonal + >>> torch.einsum('ii->i', torch.randn(4, 4)) + tensor([-0.1034, 0.7952, -0.2433, 0.4545]) + + # outer product + >>> x = torch.randn(5) + >>> y = torch.randn(4) + >>> torch.einsum('i,j->ij', x, y) + tensor([[ 0.1156, -0.2897, -0.3918, 0.4963], + [-0.3744, 0.9381, 1.2685, -1.6070], + [ 0.7208, -1.8058, -2.4419, 3.0936], + [ 0.1713, -0.4291, -0.5802, 0.7350], + [ 0.5704, -1.4290, -1.9323, 2.4480]]) + + # batch matrix multiplication + >>> As = torch.randn(3,2,5) + >>> Bs = torch.randn(3,5,4) + >>> torch.einsum('bij,bjk->bik', As, Bs) + tensor([[[-1.0564, -1.5904, 3.2023, 3.1271], + [-1.6706, -0.8097, -0.8025, -2.1183]], + + [[ 4.2239, 0.3107, -0.5756, -0.2354], + [-1.4558, -0.3460, 1.5087, -0.8530]], + + [[ 2.8153, 1.8787, -4.3839, -1.2112], + [ 0.3728, -2.1131, 0.0921, 0.8305]]]) + + # batch permute + >>> A = torch.randn(2, 3, 4, 5) + >>> torch.einsum('...ij->...ji', A).shape + torch.Size([2, 3, 5, 4]) + + # equivalent to torch.nn.functional.bilinear + >>> A = torch.randn(3,5,4) + >>> l = torch.randn(2,5) + >>> r = torch.randn(2,4) + >>> torch.einsum('bn,anm,bm->ba', l, A, r) + tensor([[-0.3430, -5.2405, 0.4494], + [ 0.3311, 5.5201, -3.0356]]) + """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in operands) and has_torch_function(operands): return handle_torch_function(einsum, operands, equation, *operands) diff --git a/torch/fx/__init__.py b/torch/fx/__init__.py index c7fbd6fbf0ea..f3804c515612 100644 --- a/torch/fx/__init__.py +++ b/torch/fx/__init__.py @@ -44,7 +44,8 @@ def forward(self, x): The semantics are as follows: - `placeholder` represents a function input. The `name` attribute specifies the name this value will take on. - `target` is similarly the name of the argument. `args` and `kwargs` are don't-care. Placeholders correspond to + `target` is similarly the name of the argument. `args` holds either: 1) nothing, or 2) a single argument + denoting the default parameter of the function input. `kwargs` is don't-care. Placeholders correspond to the function parameters (e.g. `x`) in the graph printout. - `get_attr` retrieves a parameter from the module hierarchy. `name` is similarly the name the result of the fetch is assigned to. `target` is the fully-qualified name of the parameter's position in the module hierarchy. 
diff --git a/torch/fx/experimental/GraphManipulation.py b/torch/fx/experimental/GraphManipulation.py index 0e3b7b566ac0..7bd303f55d04 100644 --- a/torch/fx/experimental/GraphManipulation.py +++ b/torch/fx/experimental/GraphManipulation.py @@ -1,9 +1,12 @@ -from typing import Dict, List, NamedTuple -from torch.fx.graph_module import GraphModule -from torch.fx.node import Node, Target, map_arg -from torch.fx.graph import Graph +import json +from typing import Dict, List, NamedTuple, Any + import torch from torch.fx.experimental.shape_prop import ShapeProp +from torch.fx.graph import Graph, get_qualified_name +from torch.fx.graph_module import GraphModule +from torch.fx.node import Node, Target, map_arg + def replace_target_nodes_with( fx_module: GraphModule, @@ -15,22 +18,26 @@ def replace_target_nodes_with( """Modifies all nodes in fx_module.graph.nodes which match the specified op code and target, and updates them to match the new op code and target""" new_graph = Graph() - val_map : Dict[Node, Node] = {} + val_map: Dict[Node, Node] = {} for node in fx_module.graph.nodes: if node.op == old_op and node.target == old_target: args = map_arg(node.args, lambda n: val_map[n]) kwargs = map_arg(node.kwargs, lambda n: val_map[n]) assert isinstance(args, tuple) assert isinstance(kwargs, dict) - val_map[node] = new_graph.create_node(new_op, new_target, args, kwargs, node.name) + val_map[node] = new_graph.create_node( + new_op, new_target, args, kwargs, node.name + ) else: - val_map[node] = new_graph.node_copy(node, lambda n : val_map[n]) + val_map[node] = new_graph.node_copy(node, lambda n: val_map[n]) fx_module.graph = new_graph + class size_bytes(NamedTuple): output_size: int total_size: int + def get_size_of_all_nodes(fx_module: GraphModule, args: List[torch.Tensor]) -> None: """Given a fx graph module, update each node with its total size (weights + bias + output) and its output_size(output). For a non-module node, the total size is the output size. @@ -40,19 +47,20 @@ def get_size_of_all_nodes(fx_module: GraphModule, args: List[torch.Tensor]) -> N # Calculate the total size of the whole fx graph total_size_of_graph = 0.0 for node in fx_module.graph.nodes: - if node.op == 'output': + if node.op == "output": break node.size_bytes = get_size_of_node(fx_module, node) return + def get_size_of_node(fx_module: GraphModule, node: Node) -> size_bytes: """Given a node with node.dtype and node.shape, return its total size and its output size. 
- total_size = weights + bias + output_size + total_size = weights + bias + output_size """ # Total num of elements total_num_of_elems = 0 # For a module, conside all parameters - if node.op == 'call_module': + if node.op == "call_module": submodule_dict = dict(fx_module.named_modules()) submodule = submodule_dict[node.target] parameters = submodule.named_parameters() @@ -61,18 +69,165 @@ def get_size_of_node(fx_module: GraphModule, node: Node) -> size_bytes: total_num_of_elems += p.numel() # Don't forget the output size # node.shape is the shape of this node's output - shape = getattr(node, 'shape', None) + shape = getattr(node, "shape", None) if shape: output_elem = shape.numel() else: - raise RuntimeError('Node has no shape attr') + raise RuntimeError("Node has no shape attr") total_num_of_elems += output_elem size_per_elem_bytes = 0 - dtype = getattr(node, 'dtype', None) + dtype = getattr(node, "dtype", None) if dtype: size_per_elem_bytes = torch.tensor([], dtype=dtype).element_size() else: - raise RuntimeError('Node has no dtype attr') + raise RuntimeError("Node has no dtype attr") total_size = size_per_elem_bytes * total_num_of_elems output_size = size_per_elem_bytes * output_elem return size_bytes(output_size, total_size) + + +def serialize_shape(shape: torch.Size) -> str: + return str(list(shape)) + + +def serialize_tensor_quantization(tensor: torch.Tensor) -> Dict[str, Any]: + scheme = {} # type: Dict[str, Any] + if tensor.is_quantized: + scheme["q_scheme"] = str(tensor.qscheme()) + if tensor.qscheme() in {torch.per_tensor_affine, torch.per_tensor_symmetric}: + scheme["q_scale"] = tensor.q_scale() + scheme["q_zero_pont"] = tensor.q_zero_point() + if tensor.qscheme() in { + torch.per_channel_affine, + torch.per_channel_affine_float_qparams, + torch.per_channel_symmetric, + }: + scheme["q_per_channel_scales"] = tensor.q_per_channel_scales().tolist() + scheme[ + "q_per_channel_zero_points" + ] = tensor.q_per_channel_zero_points().tolist() + scheme["q_per_channel_axis"] = tensor.q_per_channel_axis() + + return scheme + + +def serialize_weight(tensor: torch.Tensor) -> Dict: + weight = {} # type: Dict[str, Any] + weight["dtype"] = str(tensor.dtype) + weight["is_quantized"] = tensor.is_quantized + if tensor.is_quantized: + weight["quantized_type"] = serialize_tensor_quantization(tensor) + weight["shape"] = serialize_shape(tensor.shape) + return weight + + +def serialize_module(fx_module: GraphModule, weights: Dict, name_prefix="") -> Dict: + """Recursively Serializes a graph module (fx_module) to a dictionary which is later exported to JSON. + It also adds all weights the provided weights dictionary by qualified_name. + Dictionary Schema: + MODULE + { + modules: {module_name: MODULE], + nodes: [NODE], + weights {qualified_name: WEIGHT}, + } + NODE + { + shape: [], + dtype: dtype, + target: target, + op_code: op_code, + name: name, + args: [], + kwargs: {} + } + WEIGHT + { + dtype: dtype, + is_quantized: bool, + shape: [], + quantization_info: QUANTIZATION + } + QUANTIZATION + { + qscheme: qscheme, + q_scale: float, + q_zero_point: float, + q_per_channel_scales, [], + q_per_channel_zero_points: [], + q_per_channel_axis, int + } + """ + serialized_dict = {} # type: Dict[str, Any] + serialized_dict["modules"] = {} + serialized_dict["weights"] = {} + serialized_dict["nodes"] = [] + parameters = fx_module.named_parameters() + for name, p in parameters: + if isinstance(p, torch.Tensor): + weight = serialize_weight(p) + prefix = f"{name_prefix}." 
if name_prefix else "" + serialized_dict["weights"][prefix + name] = weight + weights[prefix + name] = p + for node in fx_module.graph.nodes: + node_rep = {} # type: Dict[str, Any] + # Get shape/type info, currently not needed for call_module. + if node.op != "call_module": + shape = getattr(node, "shape", None) + if shape: + node_rep["shape"] = serialize_shape(shape) + else: + raise RuntimeError( + "Node has no shape attr, this is likely because shape propagation has not been run on this Graph." + ) + dtype = getattr(node, "dtype", None) + if dtype: + node_rep["dtype"] = str(dtype) + else: + raise RuntimeError( + "Node has no dtype attr, this is likely because shape propagation has not been run on this Graph." + ) + + # Recurse down into any submodules we are calling. + if node.op == "call_module": + submodules = dict(fx_module.named_modules()) + if isinstance(submodules[node.target], GraphModule): + serialized_module = serialize_module( + getattr(fx_module, node.target), weights, node.target + ) + serialized_dict["modules"][node.target] = serialized_module + + if node.op == "call_function": + node_rep["target"] = get_qualified_name(node.target) + else: + node_rep["target"] = str(node.target) + + # Make sure we capture all constants. + if node.op == "get_attr": + target = getattr(fx_module, node.target) + prefix = f"{name_prefix}." if name_prefix else "" + qualname = prefix + node.target + if isinstance(target, torch.Tensor) and qualname not in weights: + weight = serialize_weight(target) + serialized_dict["weights"][prefix + node.target] = weight + weights[prefix + node.target] = target + + node_rep["op_code"] = node.op + node_rep["name"] = node.name + node_rep["args"] = map_arg( + node.args, lambda arg: {"is_node": True, "name": str(arg)} + ) + node_rep["kwargs"] = map_arg( + node.kwargs, lambda arg: {"is_node": True, "name": str(arg)} + ) + serialized_dict["nodes"] += [node_rep] + + return serialized_dict + + +class AcceleratedGraphModule: + def __init__(self, fx_module: GraphModule): + """Creates the needed data structures to pass to the glow runtime""" + self.weights = {} # type: Dict[str, Any] + self.serialized_graph = serialize_module(fx_module, self.weights) + self.serialized_graph_json = json.dumps(self.serialized_graph, indent=4) diff --git a/torch/fx/graph.py b/torch/fx/graph.py index 8a99f772c4c1..dd07ff7a508e 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -16,7 +16,7 @@ def _is_magic(x: str) -> bool: def snake_case(s: str) -> str: return ''.join(['_' + i.lower() if i.isupper() else i for i in s]).lstrip('_') -def _qualified_name(func: Callable[..., Any]) -> str: +def get_qualified_name(func: Callable[..., Any]) -> str: # things like getattr just appear in builtins if getattr(builtins, func.__name__, None) is func: return func.__name__ @@ -344,7 +344,8 @@ def type_repr(o : Any): if node.op == 'placeholder': assert isinstance(node.target, str) maybe_type_annotation = '' if node.type is None else f' : {type_repr(node.type)}' - free_vars.append(f'{node.target}{maybe_type_annotation}') + maybe_default_arg = '' if not node.args else f' = {repr(node.args[0])}' + free_vars.append(f'{node.target}{maybe_type_annotation}{maybe_default_arg}') raw_name = node.target.replace('*', '') if raw_name != node.name: body.append(f'{node.name} = {raw_name}\n') @@ -362,7 +363,7 @@ def type_repr(o : Any): assert isinstance(node.args, tuple) body.append(f'{node.name} = {magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}\n') continue - qualified_name = 
_qualified_name(node.target) + qualified_name = get_qualified_name(node.target) register_modules_used(qualified_name) if qualified_name == 'getattr' and \ isinstance(node.args, tuple) and \ @@ -384,7 +385,7 @@ def type_repr(o : Any): elif node.op == 'output': if node.type is not None: maybe_return_annotation = f" -> {type_repr(node.type)}" - body.append(f'return {node.args[0]}') + body.append(f'return {repr(node.args[0])}') continue raise NotImplementedError(f'node: {node.op} {node.target}') diff --git a/torch/fx/proxy.py b/torch/fx/proxy.py index d1672d332f14..317e039223a0 100644 --- a/torch/fx/proxy.py +++ b/torch/fx/proxy.py @@ -27,6 +27,15 @@ def proxy(self, node: Node) -> 'Proxy': def create_proxy(self, kind: str, target: Target, args: Tuple[Any, ...], kwargs: Dict[str, Any], name: Optional[str] = None, type_expr : Optional[Any] = None): + ''' + Create a Node from the given arguments, then return the Node + wrapped in a Proxy object. + + If kind = 'placeholder', then we're creating a Node that + represents the parameter of a function. If we need to encode + a default parameter, we use the `args` tuple. `args` is + otherwise empty for `placeholder` Nodes. + ''' args_ = self.create_arg(args) kwargs_ = self.create_arg(kwargs) assert isinstance(args_, tuple) diff --git a/torch/fx/symbolic_trace.py b/torch/fx/symbolic_trace.py index 44f0ffba98e0..20566bb58e6e 100644 --- a/torch/fx/symbolic_trace.py +++ b/torch/fx/symbolic_trace.py @@ -125,8 +125,15 @@ def create_args_for_root(self, root_fn, is_module): next(names_iter) # skip self args.append(self.root) + sig = inspect.signature(fn_for_analysis) + def proxy_placeholder(name: str): - return self.create_proxy('placeholder', name, (), {}, + if name[0] == '*': + default = () # type: ignore + else: + param = sig.parameters[name] + default = () if param.default is inspect.Parameter.empty else (param.default,) # type: ignore + return self.create_proxy('placeholder', name, default, {}, type_expr=fn_for_analysis.__annotations__.get(name, None)) args.extend(proxy_placeholder(next(names_iter)) for _ in range(skip_arg_idx, total_args)) diff --git a/torch/jit/_script.py b/torch/jit/_script.py index d4f6f96c3da2..b1f6a8bb3571 100644 --- a/torch/jit/_script.py +++ b/torch/jit/_script.py @@ -741,6 +741,19 @@ class RecursiveScriptModule(ScriptModule): # type: ignore def __init__(self, arg=None): super().__init__() +def call_prepare_scriptable_func(obj): + if not isinstance(obj, torch.nn.Module): + return obj + obj = obj.__prepare_scriptable__() if hasattr(obj, '__prepare_scriptable__') else obj # type: ignore + for name in obj.__dict__: + sub_module = obj.__dict__.get(name) + if name == '_modules': + for k, v in sub_module.items(): + sub_module[k] = call_prepare_scriptable_func(v) + obj.__setattr__(name, sub_module) + elif isinstance(sub_module, torch.nn.Module) and not isinstance(sub_module, ScriptModule): + obj.__setattr__(name, call_prepare_scriptable_func(sub_module)) + return obj def script(obj, optimize=None, _frames_up=0, _rcb=None): r""" @@ -894,6 +907,7 @@ def forward(self, input): return obj if isinstance(obj, torch.nn.Module): + obj = call_prepare_scriptable_func(obj) return torch.jit._recursive.create_script_module( obj, torch.jit._recursive.infer_methods_to_compile ) diff --git a/torch/lib/c10d/PrefixStore.cpp b/torch/lib/c10d/PrefixStore.cpp index 5f9a3c9c21ec..6f71e422bd0e 100644 --- a/torch/lib/c10d/PrefixStore.cpp +++ b/torch/lib/c10d/PrefixStore.cpp @@ -4,7 +4,7 @@ namespace c10d { PrefixStore::PrefixStore( const std::string& prefix, - 
std::shared_ptr store) + c10::intrusive_ptr store) : prefix_(prefix), store_(store) {} std::string PrefixStore::joinKey(const std::string& key) { diff --git a/torch/lib/c10d/PrefixStore.hpp b/torch/lib/c10d/PrefixStore.hpp index cad7112fbd76..ec50b3b719bf 100644 --- a/torch/lib/c10d/PrefixStore.hpp +++ b/torch/lib/c10d/PrefixStore.hpp @@ -7,7 +7,9 @@ namespace c10d { class PrefixStore : public Store { public: - explicit PrefixStore(const std::string& prefix, std::shared_ptr store); + explicit PrefixStore( + const std::string& prefix, + c10::intrusive_ptr store); virtual ~PrefixStore(){}; @@ -31,7 +33,7 @@ class PrefixStore : public Store { protected: std::string prefix_; - std::shared_ptr store_; + c10::intrusive_ptr store_; std::string joinKey(const std::string& key); std::vector joinKeys(const std::vector& keys); diff --git a/torch/lib/c10d/ProcessGroup.cpp b/torch/lib/c10d/ProcessGroup.cpp index 3521ed42c840..1d0d451f21a9 100644 --- a/torch/lib/c10d/ProcessGroup.cpp +++ b/torch/lib/c10d/ProcessGroup.cpp @@ -164,7 +164,7 @@ ProcessGroup::~ProcessGroup() {} // This is introduced so that implementors of ProcessGroup would not need to // have this implmentation. -std::shared_ptr ProcessGroup::allgather_coalesced( +c10::intrusive_ptr ProcessGroup::allgather_coalesced( std::vector>& /* usused */, std::vector& /* usused */, const AllgatherOptions& /* usused */) { diff --git a/torch/lib/c10d/ProcessGroup.hpp b/torch/lib/c10d/ProcessGroup.hpp index 5e90dccc25c0..63996b516a06 100644 --- a/torch/lib/c10d/ProcessGroup.hpp +++ b/torch/lib/c10d/ProcessGroup.hpp @@ -70,12 +70,11 @@ bool isP2POp(OpType opType); // class ProcessGroup { public: - // Please do not use ProcessGroup::Work API, it is going away, to be // replaced by ivalue::Future. // Python binding for this class might change, please do not assume // this will be bound using pybind. - class Work { + class Work : public torch::CustomClassHolder { public: Work(int rank = -1, OpType opType = OpType::UNKNOWN, const char* profilingTitle = nullptr); @@ -171,25 +170,25 @@ class ProcessGroup { return size_; } - virtual std::shared_ptr broadcast( + virtual c10::intrusive_ptr broadcast( std::vector& data, const BroadcastOptions& opts = BroadcastOptions()) = 0; - virtual std::shared_ptr allreduce( + virtual c10::intrusive_ptr allreduce( std::vector& data, const AllreduceOptions& opts = AllreduceOptions()) = 0; // This will be moved out of ProcessGroup, do not add dependencies on this // function. - virtual std::shared_ptr allreduce_coalesced( + virtual c10::intrusive_ptr allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts = AllreduceCoalescedOptions()) = 0; - virtual std::shared_ptr reduce( + virtual c10::intrusive_ptr reduce( std::vector& tensors, const ReduceOptions& opts = ReduceOptions()) = 0; - virtual std::shared_ptr allgather( + virtual c10::intrusive_ptr allgather( std::vector>& outputTensors, std::vector& inputTensors, const AllgatherOptions& opts = AllgatherOptions()) = 0; @@ -197,7 +196,7 @@ class ProcessGroup { // Gathers a single tensor inputBuffer into a single buffer outputBuffer that // is interpreted as a contigious collection of size inputBuffer * WORLD_SIZE. // For implementers of ProcessGroup API and advanced users only. 
- virtual std::shared_ptr allgather_base( + virtual c10::intrusive_ptr allgather_base( at::Tensor& outputBuffer, at::Tensor& inputBuffer, const AllgatherOptions& opts = AllgatherOptions()) = 0; @@ -206,27 +205,27 @@ class ProcessGroup { // * do not add dependencies on this function, // * do not implement it in your ProcessGroup, implement allgather_base // instead. - virtual std::shared_ptr allgather_coalesced( + virtual c10::intrusive_ptr allgather_coalesced( std::vector>& outputTensorLists, std::vector& inputTensors, const AllgatherOptions& opts = AllgatherOptions()); - virtual std::shared_ptr gather( + virtual c10::intrusive_ptr gather( std::vector>& outputTensors, std::vector& inputTensors, const GatherOptions& opts = GatherOptions()) = 0; - virtual std::shared_ptr scatter( + virtual c10::intrusive_ptr scatter( std::vector& outputTensors, std::vector>& inputTensors, const ScatterOptions& opts = ScatterOptions()) = 0; - virtual std::shared_ptr reduce_scatter( + virtual c10::intrusive_ptr reduce_scatter( std::vector& outputTensors, std::vector>& inputTensors, const ReduceScatterOptions& opts = ReduceScatterOptions()) = 0; - virtual std::shared_ptr alltoall_base( + virtual c10::intrusive_ptr alltoall_base( at::Tensor& outputTensor, at::Tensor& inputTensor, std::vector& outputSplitSizes, @@ -235,28 +234,28 @@ class ProcessGroup { throw std::runtime_error("ProcessGroup does not support alltoall"); } - virtual std::shared_ptr alltoall( + virtual c10::intrusive_ptr alltoall( std::vector& outputTensors, std::vector& inputTensors, const AllToAllOptions& opts = AllToAllOptions()) { throw std::runtime_error("ProcessGroup does not support alltoall"); } - virtual std::shared_ptr send( + virtual c10::intrusive_ptr send( std::vector& tensors, int dstRank, int tag) = 0; - virtual std::shared_ptr recv( + virtual c10::intrusive_ptr recv( std::vector& tensors, int srcRank, int tag) = 0; - virtual std::shared_ptr recvAnysource( + virtual c10::intrusive_ptr recvAnysource( std::vector& tensors, int tag) = 0; - virtual std::shared_ptr barrier( + virtual c10::intrusive_ptr barrier( const BarrierOptions& opts = BarrierOptions()) = 0; protected: diff --git a/torch/lib/c10d/ProcessGroupGloo.cpp b/torch/lib/c10d/ProcessGroupGloo.cpp index cd3e83e6b714..22da878cce43 100644 --- a/torch/lib/c10d/ProcessGroupGloo.cpp +++ b/torch/lib/c10d/ProcessGroupGloo.cpp @@ -38,6 +38,7 @@ #endif #include +#include #include #include #include @@ -107,7 +108,7 @@ namespace { // Wrap c10d store as Gloo store class GlooStore : public ::gloo::rendezvous::Store { public: - GlooStore(const std::shared_ptr<::c10d::Store>& store) : store_(store) {} + GlooStore(const c10::intrusive_ptr<::c10d::Store>& store) : store_(store) {} void set(const std::string& key, const std::vector& value) override { std::vector tmp(value.begin(), value.end()); @@ -130,7 +131,7 @@ class GlooStore : public ::gloo::rendezvous::Store { } protected: - std::shared_ptr<::c10d::Store> store_; + c10::intrusive_ptr<::c10d::Store> store_; }; typedef void (*ReduceFunc)(void*, const void*, const void*, size_t); @@ -561,7 +562,7 @@ std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo:: #endif ProcessGroupGloo::ProcessGroupGloo( - const std::shared_ptr& store, + const c10::intrusive_ptr& store, int rank, int size, Options options) @@ -653,11 +654,11 @@ void ProcessGroupGloo::runLoop(int workerIndex) { AsyncWork::execute(std::move(work)); lock.lock(); - workInProgress_[workerIndex] = nullptr; + workInProgress_[workerIndex].reset(); } } -void 
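// Sketch (illustrative only, not applied by this patch): one small but easy-to-miss edit
// above is that clearing a worker slot changes from assigning nullptr to calling
// reset(), which is the idiomatic way to drop the reference held by a c10::intrusive_ptr.
//
#include <cstddef>
#include <vector>
#include <c10/util/intrusive_ptr.h>

struct SlotWork : c10::intrusive_ptr_target {};

void clear_slot(std::vector<c10::intrusive_ptr<SlotWork>>& workInProgress, size_t index) {
  // Same effect the old `workInProgress[index] = nullptr;` had on a shared_ptr:
  // the refcount is decremented and the slot is left empty.
  workInProgress[index].reset();
}
// end of sketch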
ProcessGroupGloo::enqueue(std::shared_ptr work) { +void ProcessGroupGloo::enqueue(c10::intrusive_ptr work) { std::unique_lock lock(workMutex_); workQueue_.push_back(std::move(work)); lock.unlock(); @@ -773,7 +774,7 @@ class AsyncBroadcastCUDAWork : public AsyncBroadcastWork { } // namespace -std::shared_ptr ProcessGroupGloo::broadcast( +c10::intrusive_ptr ProcessGroupGloo::broadcast( std::vector& inputs, const BroadcastOptions& opts) { static auto invalidArgument = [](const std::string& msg) { @@ -796,15 +797,15 @@ std::shared_ptr ProcessGroupGloo::broadcast( invalidArgument(c10::str("unsupported device type ", device.type())); } - std::shared_ptr work; + c10::intrusive_ptr work; auto tag = nextTag(); auto context = getContext(tag); if (device.type() == at::kCPU) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), inputs, opts.rootRank, opts.rootTensor, tag); #ifdef USE_CUDA } else if (device.type() == at::kCUDA) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), inputs, opts.rootRank, opts.rootTensor, tag); #endif } else { @@ -1300,7 +1301,7 @@ class AsyncSparseAllreduceCUDAWork : public AsyncSparseAllreduceWork { } // namespace -std::shared_ptr ProcessGroupGloo::allreduce( +c10::intrusive_ptr ProcessGroupGloo::allreduce( std::vector& inputs, const AllreduceOptions& opts) { static auto invalidArgument = [](const std::string& msg) { @@ -1329,15 +1330,15 @@ std::shared_ptr ProcessGroupGloo::allreduce( "(allreduce of sparse tensors only works with ReduceOp.SUM)"); } - std::shared_ptr work; + c10::intrusive_ptr work; auto tag = nextTag(); auto context = getContext(tag); if (device.type() == at::kCPU) { if (layout == c10::kStrided) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), inputs, opts.reduceOp, tag); } else if (layout == c10::kSparse) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), inputs, tag); } else { invalidArgument("unsupported layout"); @@ -1345,10 +1346,10 @@ std::shared_ptr ProcessGroupGloo::allreduce( #ifdef USE_CUDA } else if (device.type() == at::kCUDA) { if (layout == c10::kStrided) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), inputs, opts.reduceOp, tag); } else if (layout == c10::kSparse) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), inputs, tag); } else { invalidArgument("unsupported layout"); @@ -1362,7 +1363,7 @@ std::shared_ptr ProcessGroupGloo::allreduce( return work; } -std::shared_ptr ProcessGroupGloo::allreduce_coalesced( +c10::intrusive_ptr ProcessGroupGloo::allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts) { static auto invalidArgument = [](const std::string& msg) { @@ -1405,12 +1406,12 @@ std::shared_ptr ProcessGroupGloo::allreduce_coalesced( invalidArgument("unsupported layout"); } - std::shared_ptr work; + c10::intrusive_ptr work; const uint32_t tag = nextTag(); std::shared_ptr context = getContext(tag); if (device.type() == c10::kCPU) { if (layout == c10::kStrided) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), tensors, opts.reduceOp, tag); } else { invalidArgument("unsupported layout"); @@ -1538,7 +1539,7 @@ class AsyncReduceCUDAWork : public AsyncReduceWork { } // namespace -std::shared_ptr ProcessGroupGloo::reduce( +c10::intrusive_ptr ProcessGroupGloo::reduce( std::vector& inputs, const ReduceOptions& opts) { static auto invalidArgument = [](const std::string& msg) { @@ -1561,11 +1562,11 @@ std::shared_ptr 
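// Sketch (illustrative only, not applied by this patch): enqueue() now accepts the work
// handle as a c10::intrusive_ptr by value and moves it into the queue, so ownership is
// shared between the worker queue and whichever caller keeps the handle returned by the
// collective. Stripped-down version of that handoff, with QueuedWork standing in for
// the AsyncWork subclasses elided in this diff:
//
#include <deque>
#include <mutex>
#include <c10/util/intrusive_ptr.h>

struct QueuedWork : c10::intrusive_ptr_target {};

class DemoQueue {
 public:
  void enqueue(c10::intrusive_ptr<QueuedWork> work) {
    std::unique_lock<std::mutex> lock(mutex_);
    queue_.push_back(std::move(work));  // the queue now holds one of the references
  }

 private:
  std::deque<c10::intrusive_ptr<QueuedWork>> queue_;
  std::mutex mutex_;
};
// end of sketch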
ProcessGroupGloo::reduce( invalidArgument(c10::str("unsupported device type ", device.type())); } - std::shared_ptr work; + c10::intrusive_ptr work; auto tag = nextTag(); auto context = getContext(tag); if (device.type() == at::kCPU) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), inputs, opts.rootRank, @@ -1574,7 +1575,7 @@ std::shared_ptr ProcessGroupGloo::reduce( tag); #ifdef USE_CUDA } else if (device.type() == at::kCUDA) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), inputs, opts.rootRank, @@ -1720,7 +1721,7 @@ class AsyncAllgatherCUDAWork : public AsyncAllgatherWork { // Note: current CUDA implementation holds the assumption that the // tensors in the nested output tensor vectors are on the same device. -std::shared_ptr ProcessGroupGloo::allgather( +c10::intrusive_ptr ProcessGroupGloo::allgather( std::vector>& outputs, std::vector& inputs, const AllgatherOptions& opts) { @@ -1769,15 +1770,15 @@ std::shared_ptr ProcessGroupGloo::allgather( invalidArgument(c10::str("unsupported device type ", device.type())); } - std::shared_ptr work; + c10::intrusive_ptr work; auto tag = nextTag(); auto context = getContext(tag); if (device.type() == at::kCPU) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), outputs, inputs, tag); #ifdef USE_CUDA } else if (device.type() == at::kCUDA) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), outputs, inputs, tag); #endif } else { @@ -1852,7 +1853,7 @@ class AsyncAllgatherCoalescedWork : public ProcessGroupGloo::AsyncWork { } // namespace -std::shared_ptr ProcessGroupGloo::allgather_coalesced( +c10::intrusive_ptr ProcessGroupGloo::allgather_coalesced( std::vector>& output_lists, std::vector& input_list, const AllgatherOptions& /* unused */) { @@ -1902,13 +1903,13 @@ std::shared_ptr ProcessGroupGloo::allgather_coalesced( auto tag = nextTag(); auto context = getContext(tag); - auto work = std::make_shared( + auto work = c10::make_intrusive( std::move(context), output_lists, input_list, tag); enqueue(work); return work; } -std::shared_ptr ProcessGroupGloo::allgather_base( +c10::intrusive_ptr ProcessGroupGloo::allgather_base( at::Tensor& /*unused */, at::Tensor& /*unused */, const AllgatherOptions& /*unused */) { @@ -2057,7 +2058,7 @@ class AsyncGatherCUDAWork : public AsyncGatherWork { } // namespace -std::shared_ptr ProcessGroupGloo::gather( +c10::intrusive_ptr ProcessGroupGloo::gather( std::vector>& outputs, std::vector& inputs, const GatherOptions& opts) { @@ -2103,15 +2104,15 @@ std::shared_ptr ProcessGroupGloo::gather( invalidArgument(c10::str("unsupported device type ", device.type())); } - std::shared_ptr work; + c10::intrusive_ptr work; auto tag = nextTag(); auto context = getContext(tag); if (device.type() == at::kCPU) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), outputs, inputs, opts.rootRank, tag); #ifdef USE_CUDA } else if (device.type() == at::kCUDA) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), outputs, inputs, opts.rootRank, tag); #endif } else { @@ -2245,7 +2246,7 @@ class AsyncScatterCUDAWork : public AsyncScatterWork { } // namespace -std::shared_ptr ProcessGroupGloo::scatter( +c10::intrusive_ptr ProcessGroupGloo::scatter( std::vector& outputs, std::vector>& inputs, const ScatterOptions& opts) { @@ -2290,15 +2291,15 @@ std::shared_ptr ProcessGroupGloo::scatter( invalidArgument(c10::str("unsupported device type ", device.type())); } - std::shared_ptr work; + 
c10::intrusive_ptr work; auto tag = nextTag(); auto context = getContext(tag); if (device.type() == at::kCPU) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), outputs, inputs, opts.rootRank, tag); #ifdef USE_CUDA } else if (device.type() == at::kCUDA) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), outputs, inputs, opts.rootRank, tag); #endif } else { @@ -2308,7 +2309,7 @@ std::shared_ptr ProcessGroupGloo::scatter( return work; } -std::shared_ptr ProcessGroupGloo::reduce_scatter( +c10::intrusive_ptr ProcessGroupGloo::reduce_scatter( std::vector& outputs, std::vector>& inputs, const ReduceScatterOptions& opts) { @@ -2443,7 +2444,7 @@ class AsyncAlltoallCUDAWork : public AsyncAlltoallWork { } // namespace -std::shared_ptr ProcessGroupGloo::alltoall_base( +c10::intrusive_ptr ProcessGroupGloo::alltoall_base( at::Tensor& outputTensor, at::Tensor& inputTensor, std::vector& outputCounts, @@ -2460,12 +2461,12 @@ std::shared_ptr ProcessGroupGloo::alltoall_base( assertDense(invalidArgument, {inputTensor}); const auto& device = outputTensor.device(); - std::shared_ptr work; + c10::intrusive_ptr work; auto tag = nextTag(); auto context = getContext(tag); if (device.type() == at::kCPU) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), outputTensor, inputTensor, @@ -2474,7 +2475,7 @@ std::shared_ptr ProcessGroupGloo::alltoall_base( tag); #ifdef USE_CUDA } else if (device.type() == at::kCUDA) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), outputTensor, inputTensor, @@ -2510,7 +2511,7 @@ uint32_t checkTag(int32_t tag) { return (uint32_t)tag; } -std::shared_ptr ProcessGroupGloo::send( +c10::intrusive_ptr ProcessGroupGloo::send( std::vector& tensors, int dstRank, int tag) { @@ -2526,10 +2527,10 @@ std::shared_ptr ProcessGroupGloo::send( // The work captures the tensor to prevent it being deallocated and // the unbound buffer to synchronize on completion of the send. - return std::make_shared(tensor, std::move(buf)); + return c10::make_intrusive(tensor, std::move(buf)); } -std::shared_ptr ProcessGroupGloo::recv( +c10::intrusive_ptr ProcessGroupGloo::recv( std::vector& tensors, int srcRank, int tag) { @@ -2545,10 +2546,10 @@ std::shared_ptr ProcessGroupGloo::recv( // The work captures the tensor to prevent it being deallocated and // the unbound buffer to synchronize on completion of the recv. - return std::make_shared(tensor, std::move(buf)); + return c10::make_intrusive(tensor, std::move(buf)); } -std::shared_ptr ProcessGroupGloo::recvAnysource( +c10::intrusive_ptr ProcessGroupGloo::recvAnysource( std::vector& tensors, int tag) { auto& tensor = checkSingleTensor(tensors); @@ -2573,7 +2574,7 @@ std::shared_ptr ProcessGroupGloo::recvAnysource( // The work captures the tensor to prevent it being deallocated and // the unbound buffer to synchronize on completion of the recv. 
- return std::make_shared(tensor, std::move(buf)); + return c10::make_intrusive(tensor, std::move(buf)); } namespace { @@ -2582,13 +2583,13 @@ class AsyncBarrierWork : public ProcessGroupGloo::AsyncWork { public: AsyncBarrierWork( const std::shared_ptr& context, - std::vector> priorWork, + std::vector> priorWork, uint32_t tag) : ProcessGroupGloo::AsyncWork("gloo:barrier"), context(context), priorWork(std::move(priorWork)), tag(tag) {} std::shared_ptr context; - std::vector> priorWork; + std::vector> priorWork; const uint32_t tag; void run() override { @@ -2608,9 +2609,9 @@ class AsyncBarrierWork : public ProcessGroupGloo::AsyncWork { } // namespace -std::shared_ptr ProcessGroupGloo::barrier( +c10::intrusive_ptr ProcessGroupGloo::barrier( const BarrierOptions& opts) { - std::vector> priorWork; + std::vector> priorWork; // Snapshot all in progress and pending work as weak_ptr. // When executing a barrier, we need to ensure that all prior work @@ -2624,7 +2625,7 @@ std::shared_ptr ProcessGroupGloo::barrier( auto tag = nextTag(); auto context = getContext(tag); - auto work = std::make_shared( + auto work = c10::make_intrusive( std::move(context), std::move(priorWork), tag); enqueue(work); return work; diff --git a/torch/lib/c10d/ProcessGroupGloo.hpp b/torch/lib/c10d/ProcessGroupGloo.hpp index 31664ad0b6cf..0508b6f857a1 100644 --- a/torch/lib/c10d/ProcessGroupGloo.hpp +++ b/torch/lib/c10d/ProcessGroupGloo.hpp @@ -70,7 +70,7 @@ class ProcessGroupGloo : public ProcessGroup { public: AsyncWork(const char* profilingTitle = nullptr): ProcessGroup::Work(-1, OpType::UNKNOWN, profilingTitle) {} - static void execute(std::shared_ptr work) { + static void execute(c10::intrusive_ptr work) { std::exception_ptr eptr; try { work->run(); @@ -152,82 +152,82 @@ class ProcessGroupGloo : public ProcessGroup { static std::shared_ptr<::gloo::transport::Device> createDefaultDevice(); explicit ProcessGroupGloo( - const std::shared_ptr& store, + const c10::intrusive_ptr& store, int rank, int size, Options options = Options()); virtual ~ProcessGroupGloo(); - std::shared_ptr broadcast( + c10::intrusive_ptr broadcast( std::vector& tensors, const BroadcastOptions& opts = BroadcastOptions()) override; - std::shared_ptr allreduce( + c10::intrusive_ptr allreduce( std::vector& tensors, const AllreduceOptions& opts = AllreduceOptions()) override; - std::shared_ptr allreduce_coalesced( + c10::intrusive_ptr allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts = AllreduceCoalescedOptions()) override; - std::shared_ptr reduce( + c10::intrusive_ptr reduce( std::vector& tensors, const ReduceOptions& opts = ReduceOptions()) override; - std::shared_ptr allgather( + c10::intrusive_ptr allgather( std::vector>& outputs, std::vector& inputs, const AllgatherOptions& opts = AllgatherOptions()) override; - std::shared_ptr allgather_base( + c10::intrusive_ptr allgather_base( at::Tensor& outputBuffer, at::Tensor& inputBuffer, const AllgatherOptions& opts = AllgatherOptions()) override; - std::shared_ptr allgather_coalesced( + c10::intrusive_ptr allgather_coalesced( std::vector>& output_lists, std::vector& input_list, const AllgatherOptions& opts = AllgatherOptions()) override; - std::shared_ptr gather( + c10::intrusive_ptr gather( std::vector>& outputs, std::vector& inputs, const GatherOptions& opts = GatherOptions()) override; - std::shared_ptr scatter( + c10::intrusive_ptr scatter( std::vector& outputs, std::vector>& inputs, const ScatterOptions& opts = ScatterOptions()) override; - std::shared_ptr reduce_scatter( 
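// Sketch (illustrative only, not applied by this patch): barrier() above snapshots the
// outstanding work as weak references so it can wait on prior operations without keeping
// them alive artificially. The element type of priorWork is not visible here (the angle
// brackets were lost in extraction); the natural counterpart to std::weak_ptr in this
// setting is c10::weak_intrusive_ptr, sketched below on a hypothetical PriorWork type.
//
#include <vector>
#include <c10/util/intrusive_ptr.h>

struct PriorWork : c10::intrusive_ptr_target {
  void wait() {}
};

void wait_for_prior(const std::vector<c10::intrusive_ptr<PriorWork>>& inFlight) {
  std::vector<c10::weak_intrusive_ptr<PriorWork>> prior;
  for (const auto& w : inFlight) {
    prior.emplace_back(w);  // weak reference: does not extend the work's lifetime
  }
  for (auto& weak : prior) {
    auto strong = weak.lock();  // empty if the work already completed and was freed
    if (strong.get() != nullptr) {
      strong->wait();
    }
  }
}
// end of sketch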
+ c10::intrusive_ptr reduce_scatter( std::vector& outputs, std::vector>& inputs, const ReduceScatterOptions& opts = ReduceScatterOptions()) override; - std::shared_ptr alltoall_base( + c10::intrusive_ptr alltoall_base( at::Tensor& outputTensor, at::Tensor& inputTensor, std::vector& outputCounts, std::vector& inputCounts, const AllToAllOptions& opts = AllToAllOptions()) override; - std::shared_ptr send( + c10::intrusive_ptr send( std::vector& tensors, int dstRank, int tag) override; - std::shared_ptr recv( + c10::intrusive_ptr recv( std::vector& tensors, int srcRank, int tag) override; - std::shared_ptr recvAnysource( + c10::intrusive_ptr recvAnysource( std::vector& tensors, int tag) override; - std::shared_ptr barrier( + c10::intrusive_ptr barrier( const BarrierOptions& opts = BarrierOptions()) override; protected: @@ -258,7 +258,7 @@ class ProcessGroupGloo : public ProcessGroup { void runLoop(int workerIndex); // Queue work to run on worker thread. - void enqueue(std::shared_ptr work); + void enqueue(c10::intrusive_ptr work); // Keep both a queue of pending work, and a vector with in progress work. // Both of these can only be mutated when holding the queue lock. @@ -266,8 +266,8 @@ class ProcessGroupGloo : public ProcessGroup { // to all in progress and pending work when executing a barrier. // When executing a barrier, we need to ensure that all prior work // has completed before completing itself. - std::deque> workQueue_; - std::vector> workInProgress_; + std::deque> workQueue_; + std::vector> workInProgress_; std::mutex workMutex_; std::condition_variable workProduceCV_; std::condition_variable workConsumeCV_; diff --git a/torch/lib/c10d/ProcessGroupMPI.cpp b/torch/lib/c10d/ProcessGroupMPI.cpp index d3e79a1dd424..5f9d0be41b8f 100644 --- a/torch/lib/c10d/ProcessGroupMPI.cpp +++ b/torch/lib/c10d/ProcessGroupMPI.cpp @@ -308,9 +308,9 @@ void ProcessGroupMPI::runLoop() { } } -std::shared_ptr ProcessGroupMPI::enqueue( +c10::intrusive_ptr ProcessGroupMPI::enqueue( std::unique_ptr entry) { - auto work = std::make_shared(); + auto work = c10::make_intrusive(); std::unique_lock lock(pgMutex_); queue_.push_back(std::make_tuple(std::move(entry), work)); lock.unlock(); @@ -318,7 +318,7 @@ std::shared_ptr ProcessGroupMPI::enqueue( return work; } -std::shared_ptr ProcessGroupMPI::broadcast( +c10::intrusive_ptr ProcessGroupMPI::broadcast( std::vector& tensors, const BroadcastOptions& opts) { checkSingleTensor(tensors); @@ -339,7 +339,7 @@ std::shared_ptr ProcessGroupMPI::broadcast( return enqueue(std::move(entry)); } -std::shared_ptr ProcessGroupMPI::allreduce( +c10::intrusive_ptr ProcessGroupMPI::allreduce( std::vector& tensors, const AllreduceOptions& opts) { checkSingleTensor(tensors); @@ -362,14 +362,14 @@ std::shared_ptr ProcessGroupMPI::allreduce( return enqueue(std::move(entry)); } -std::shared_ptr ProcessGroupMPI::allreduce_coalesced( +c10::intrusive_ptr ProcessGroupMPI::allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts) { throw std::runtime_error( "allreduce_coalesced is currently not supported with MPI"); } -std::shared_ptr ProcessGroupMPI::reduce( +c10::intrusive_ptr ProcessGroupMPI::reduce( std::vector& tensors, const ReduceOptions& opts) { checkSingleTensor(tensors); @@ -397,7 +397,7 @@ std::shared_ptr ProcessGroupMPI::reduce( return enqueue(std::move(entry)); } -std::shared_ptr ProcessGroupMPI::allgather( +c10::intrusive_ptr ProcessGroupMPI::allgather( std::vector>& outputTensors, std::vector& inputTensors, const AllgatherOptions& opts) { @@ -441,7 +441,7 
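// Sketch (illustrative only, not applied by this patch): any out-of-tree backend that
// derives from c10d::ProcessGroup has to mirror the new return types, otherwise its
// `override` declarations no longer match the base class and fail to compile.
// Declaration-level sketch only (the class stays abstract; the remaining collectives
// and the constructor arguments are illustrative):
//
#include <vector>
#include <ATen/ATen.h>
#include <c10d/ProcessGroup.hpp>

class MyBackend : public c10d::ProcessGroup {
 public:
  MyBackend(int rank, int size) : ProcessGroup(rank, size) {}

  // Previously: std::shared_ptr<ProcessGroup::Work> broadcast(...)
  c10::intrusive_ptr<c10d::ProcessGroup::Work> broadcast(
      std::vector<at::Tensor>& tensors,
      const c10d::BroadcastOptions& opts = c10d::BroadcastOptions()) override;

  // ...allreduce, reduce, allgather, send, recv, barrier, etc. need the same update.
};
// end of sketch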
@@ std::shared_ptr ProcessGroupMPI::allgather( return enqueue(std::move(entry)); } -std::shared_ptr ProcessGroupMPI::allgather_coalesced( +c10::intrusive_ptr ProcessGroupMPI::allgather_coalesced( std::vector>& /* unused */, std::vector& /* unused */, const AllgatherOptions& /* unused */) { @@ -449,7 +449,7 @@ std::shared_ptr ProcessGroupMPI::allgather_coalesced( "ProcessGroupMPI does not support allgather_coalesced"); } -std::shared_ptr ProcessGroupMPI::gather( +c10::intrusive_ptr ProcessGroupMPI::gather( std::vector>& outputTensors, std::vector& inputTensors, const GatherOptions& opts) { @@ -516,7 +516,7 @@ std::shared_ptr ProcessGroupMPI::gather( } } -std::shared_ptr ProcessGroupMPI::scatter( +c10::intrusive_ptr ProcessGroupMPI::scatter( std::vector& outputTensors, std::vector>& inputTensors, const ScatterOptions& opts) { @@ -582,14 +582,14 @@ std::shared_ptr ProcessGroupMPI::scatter( } } -std::shared_ptr ProcessGroupMPI::reduce_scatter( +c10::intrusive_ptr ProcessGroupMPI::reduce_scatter( std::vector& outputTensors, std::vector>& inputTensors, const ReduceScatterOptions& opts) { throw std::runtime_error("ProcessGroupMPI does not support reduce_scatter"); } -std::shared_ptr ProcessGroupMPI::alltoall_base( +c10::intrusive_ptr ProcessGroupMPI::alltoall_base( at::Tensor& outputTensor, at::Tensor& inputTensor, std::vector& outputSplitSizes, @@ -665,7 +665,7 @@ std::shared_ptr ProcessGroupMPI::alltoall_base( return enqueue(std::move(entry)); } } -std::shared_ptr ProcessGroupMPI::alltoall( +c10::intrusive_ptr ProcessGroupMPI::alltoall( std::vector& outputTensors, std::vector& inputTensors, const AllToAllOptions& opts) { @@ -722,7 +722,7 @@ std::shared_ptr ProcessGroupMPI::alltoall( return enqueue(std::move(entry)); } -std::shared_ptr ProcessGroupMPI::send( +c10::intrusive_ptr ProcessGroupMPI::send( std::vector& tensors, int dstRank, int tag) { @@ -744,10 +744,10 @@ std::shared_ptr ProcessGroupMPI::send( &request)); } - return std::make_shared(tensor, request); + return c10::make_intrusive(tensor, request); } -std::shared_ptr ProcessGroupMPI::recv( +c10::intrusive_ptr ProcessGroupMPI::recv( std::vector& tensors, int srcRank, int tag) { @@ -769,10 +769,10 @@ std::shared_ptr ProcessGroupMPI::recv( &request)); } - return std::make_shared(tensor, request); + return c10::make_intrusive(tensor, request); } -std::shared_ptr ProcessGroupMPI::recvAnysource( +c10::intrusive_ptr ProcessGroupMPI::recvAnysource( std::vector& tensors, int tag) { checkSingleTensor(tensors); @@ -793,10 +793,10 @@ std::shared_ptr ProcessGroupMPI::recvAnysource( &request)); } - return std::make_shared(tensor, request); + return c10::make_intrusive(tensor, request); } -std::shared_ptr ProcessGroupMPI::barrier( +c10::intrusive_ptr ProcessGroupMPI::barrier( const BarrierOptions& opts) { std::function&)> runFunc = [this](std::unique_ptr& entry) { @@ -808,7 +808,7 @@ std::shared_ptr ProcessGroupMPI::barrier( return enqueue(std::move(entry)); } -std::shared_ptr ProcessGroupMPI::allgather_base( +c10::intrusive_ptr ProcessGroupMPI::allgather_base( at::Tensor& /*unused */, at::Tensor& /*unused */, const AllgatherOptions& /*unused */) { diff --git a/torch/lib/c10d/ProcessGroupMPI.hpp b/torch/lib/c10d/ProcessGroupMPI.hpp index 342fe87001a0..48d95eada887 100644 --- a/torch/lib/c10d/ProcessGroupMPI.hpp +++ b/torch/lib/c10d/ProcessGroupMPI.hpp @@ -108,80 +108,80 @@ class ProcessGroupMPI : public ProcessGroup { // Abort the MPI program, needs to be called when exception is detected void abort(); - std::shared_ptr broadcast( + 
c10::intrusive_ptr broadcast( std::vector& data, const BroadcastOptions& opts = BroadcastOptions()) override; - std::shared_ptr allreduce( + c10::intrusive_ptr allreduce( std::vector& tensors, const AllreduceOptions& opts = AllreduceOptions()) override; - std::shared_ptr allreduce_coalesced( + c10::intrusive_ptr allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts = AllreduceCoalescedOptions()) override; - std::shared_ptr reduce( + c10::intrusive_ptr reduce( std::vector& tensors, const ReduceOptions& opts = ReduceOptions()) override; - std::shared_ptr allgather( + c10::intrusive_ptr allgather( std::vector>& outputTensors, std::vector& inputTensors, const AllgatherOptions& opts = AllgatherOptions()) override; - std::shared_ptr allgather_base( + c10::intrusive_ptr allgather_base( at::Tensor& outputbuffer, at::Tensor& inputbuffer, const AllgatherOptions& opts = AllgatherOptions()) override; - std::shared_ptr allgather_coalesced( + c10::intrusive_ptr allgather_coalesced( std::vector>& outputTensorLists, std::vector& inputTensors, const AllgatherOptions& opts = AllgatherOptions()) override; - std::shared_ptr gather( + c10::intrusive_ptr gather( std::vector>& outputTensors, std::vector& inputTensors, const GatherOptions& opts = GatherOptions()) override; - std::shared_ptr scatter( + c10::intrusive_ptr scatter( std::vector& outputTensors, std::vector>& inputTensors, const ScatterOptions& opts = ScatterOptions()) override; - std::shared_ptr reduce_scatter( + c10::intrusive_ptr reduce_scatter( std::vector& outputTensors, std::vector>& inputTensors, const ReduceScatterOptions& opts = ReduceScatterOptions()) override; - std::shared_ptr alltoall_base( + c10::intrusive_ptr alltoall_base( at::Tensor& outputTensor, at::Tensor& inputTensor, std::vector& outputSplitSizes, std::vector& inputSplitSizes, const AllToAllOptions& opts = AllToAllOptions()) override; - std::shared_ptr alltoall( + c10::intrusive_ptr alltoall( std::vector& outputTensors, std::vector& inputTensors, const AllToAllOptions& opts = AllToAllOptions()) override; - std::shared_ptr send( + c10::intrusive_ptr send( std::vector& tensors, int dstRank, int tag); - std::shared_ptr recv( + c10::intrusive_ptr recv( std::vector& tensors, int srcRank, int tag); - std::shared_ptr recvAnysource( + c10::intrusive_ptr recvAnysource( std::vector& tensor, int tag); - std::shared_ptr barrier( + c10::intrusive_ptr barrier( const BarrierOptions& opts = BarrierOptions()) override; // Creating a new ProcessGroupMPI, will initiialize MPI if not initialized @@ -190,13 +190,13 @@ class ProcessGroupMPI : public ProcessGroup { protected: using WorkType = - std::tuple, std::shared_ptr>; + std::tuple, c10::intrusive_ptr>; // Worker thread loop void runLoop(); // Helper function that is called by the destructor void destroy(); - std::shared_ptr enqueue(std::unique_ptr entry); + c10::intrusive_ptr enqueue(std::unique_ptr entry); bool stop_; diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp index ba0b4b36c77d..acb81d0cad6d 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.cpp +++ b/torch/lib/c10d/ProcessGroupNCCL.cpp @@ -437,7 +437,7 @@ bool ProcessGroupNCCL::WorkNCCL::timedOut() { } ProcessGroupNCCL::ProcessGroupNCCL( - const std::shared_ptr& store, + const c10::intrusive_ptr& store, int rank, int size, Options options) @@ -984,12 +984,12 @@ std::vector flatten_for_scatter_gather( } // namespace -std::shared_ptr ProcessGroupNCCL::initWork( +c10::intrusive_ptr ProcessGroupNCCL::initWork( std::vector devices, int 
rank, OpType opType, const char* profilingTitle) { - return std::make_shared(devices, rank, opType, profilingTitle); + return c10::make_intrusive(devices, rank, opType); } std::vector ProcessGroupNCCL::WorkNCCL::result() { @@ -1012,7 +1012,7 @@ c10::intrusive_ptr ProcessGroupNCCL::WorkNCCL:: } void ProcessGroupNCCL::workEnqueue( - std::shared_ptr work) { + c10::intrusive_ptr work) { if (!terminateProcessGroup_.load()) { std::lock_guard lock(workMetaListMutex_); // Avoid view tensors to be processed in cleanup thread. @@ -1027,7 +1027,7 @@ ProcessGroupNCCL::Options::Options() isHighPriorityStream(false) {} template -std::shared_ptr ProcessGroupNCCL::collective( +c10::intrusive_ptr ProcessGroupNCCL::collective( std::vector& inputs, std::vector& outputs, Fn fn, @@ -1114,7 +1114,7 @@ std::shared_ptr ProcessGroupNCCL::collective( } template -std::shared_ptr ProcessGroupNCCL::pointToPoint( +c10::intrusive_ptr ProcessGroupNCCL::pointToPoint( std::vector& tensors, Fn fn, int peer, @@ -1186,7 +1186,7 @@ std::shared_ptr ProcessGroupNCCL::pointToPoint( } template -std::shared_ptr ProcessGroupNCCL::collective( +c10::intrusive_ptr ProcessGroupNCCL::collective( std::vector& inputs, std::vector& outputs, Fn fn, @@ -1203,7 +1203,7 @@ std::shared_ptr ProcessGroupNCCL::collective( } template -std::shared_ptr ProcessGroupNCCL::pointToPoint( +c10::intrusive_ptr ProcessGroupNCCL::pointToPoint( std::vector& tensor, Fn fn, int peer, @@ -1217,7 +1217,7 @@ std::shared_ptr ProcessGroupNCCL::pointToPoint( [](std::vector&) {}); } -std::shared_ptr ProcessGroupNCCL::allreduce( +c10::intrusive_ptr ProcessGroupNCCL::allreduce( std::vector& tensors, const AllreduceOptions& opts) { check_gpu_tensors(tensors); @@ -1242,14 +1242,14 @@ std::shared_ptr ProcessGroupNCCL::allreduce( "nccl:all_reduce"); } -std::shared_ptr ProcessGroupNCCL::allreduce_coalesced( +c10::intrusive_ptr ProcessGroupNCCL::allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts) { throw std::runtime_error( "allreduce_coalesced is currently not supported with NCCL"); } -std::shared_ptr ProcessGroupNCCL::broadcast( +c10::intrusive_ptr ProcessGroupNCCL::broadcast( std::vector& tensors, const BroadcastOptions& opts) { check_gpu_tensors(tensors); @@ -1274,7 +1274,7 @@ std::shared_ptr ProcessGroupNCCL::broadcast( "nccl:broadcast"); } -std::shared_ptr ProcessGroupNCCL::reduce( +c10::intrusive_ptr ProcessGroupNCCL::reduce( std::vector& tensors, const ReduceOptions& opts) { check_gpu_tensors(tensors); @@ -1301,7 +1301,7 @@ std::shared_ptr ProcessGroupNCCL::reduce( "nccl:reduce"); } -std::shared_ptr ProcessGroupNCCL::allgather( +c10::intrusive_ptr ProcessGroupNCCL::allgather( std::vector>& outputTensors, std::vector& inputTensors, const AllgatherOptions& opts) { @@ -1346,7 +1346,7 @@ std::shared_ptr ProcessGroupNCCL::allgather( "nccl:all_gather"); } -std::shared_ptr ProcessGroupNCCL::allgather_coalesced( +c10::intrusive_ptr ProcessGroupNCCL::allgather_coalesced( std::vector>& /* unused */, std::vector& /* unused */, const AllgatherOptions& /* unused */) { @@ -1354,7 +1354,7 @@ std::shared_ptr ProcessGroupNCCL::allgather_coalesced( "ProcessGroupNCCL does not support allgather_coalesced"); } -std::shared_ptr ProcessGroupNCCL::reduce_scatter( +c10::intrusive_ptr ProcessGroupNCCL::reduce_scatter( std::vector& outputTensors, std::vector>& inputTensors, const ReduceScatterOptions& opts) { @@ -1400,7 +1400,7 @@ std::shared_ptr ProcessGroupNCCL::reduce_scatter( "nccl:reduce_scatter"); } -std::shared_ptr ProcessGroupNCCL::barrier( 
+c10::intrusive_ptr ProcessGroupNCCL::barrier( const BarrierOptions& opts) { std::vector devices; if (usedDeviceIdxs_.empty()) { @@ -1441,7 +1441,7 @@ std::shared_ptr ProcessGroupNCCL::barrier( } #ifdef ENABLE_NCCL_P2P_SUPPORT -std::shared_ptr ProcessGroupNCCL::alltoall_base( +c10::intrusive_ptr ProcessGroupNCCL::alltoall_base( at::Tensor& outputTensor, at::Tensor& inputTensor, std::vector& outputSplitSizes, @@ -1512,7 +1512,7 @@ std::shared_ptr ProcessGroupNCCL::alltoall_base( } } -std::shared_ptr ProcessGroupNCCL::send( +c10::intrusive_ptr ProcessGroupNCCL::send( std::vector& tensors, int dstRank, int /* unused */) { @@ -1531,7 +1531,7 @@ std::shared_ptr ProcessGroupNCCL::send( return ret; } -std::shared_ptr ProcessGroupNCCL::recv( +c10::intrusive_ptr ProcessGroupNCCL::recv( std::vector& tensors, int srcRank, int /* unused */) { @@ -1550,7 +1550,7 @@ std::shared_ptr ProcessGroupNCCL::recv( return ret; } #else -std::shared_ptr ProcessGroupNCCL::alltoall_base( +c10::intrusive_ptr ProcessGroupNCCL::alltoall_base( at::Tensor& /* unused */, at::Tensor& /* unused */, std::vector& /* unused */, @@ -1560,7 +1560,7 @@ std::shared_ptr ProcessGroupNCCL::alltoall_base( "ProcessGroupNCCL only supports alltoall* for NCCL lib version >= 2.7.0"); } -std::shared_ptr ProcessGroupNCCL::send( +c10::intrusive_ptr ProcessGroupNCCL::send( std::vector& /* unused */, int /* unused */, int /* unused */) { @@ -1568,7 +1568,7 @@ std::shared_ptr ProcessGroupNCCL::send( "ProcessGroupNCCL only supports send for NCCL lib version >= 2.7.0"); } -std::shared_ptr ProcessGroupNCCL::recv( +c10::intrusive_ptr ProcessGroupNCCL::recv( std::vector& /* unused */, int /* unused */, int /* unused */) { @@ -1591,34 +1591,34 @@ void ProcessGroupNCCL::groupEnd() { --ncclActiveGroupCounter_; } -std::shared_ptr ProcessGroupNCCL::alltoall( +c10::intrusive_ptr ProcessGroupNCCL::alltoall( std::vector& /* unused */, std::vector& /* unused */, const AllToAllOptions& /* unused */) { throw std::runtime_error("ProcessGroupNCCL does not support alltoall"); } -std::shared_ptr ProcessGroupNCCL::gather( +c10::intrusive_ptr ProcessGroupNCCL::gather( std::vector>& /* unused */, std::vector& /* unused */, const GatherOptions& /* unused */) { throw std::runtime_error("ProcessGroupNCCL does not support gather"); } -std::shared_ptr ProcessGroupNCCL::scatter( +c10::intrusive_ptr ProcessGroupNCCL::scatter( std::vector& /* unused */, std::vector>& /* unused */, const ScatterOptions& /* unused */) { throw std::runtime_error("ProcessGroupNCCL does not support scatter"); } -std::shared_ptr ProcessGroupNCCL::recvAnysource( +c10::intrusive_ptr ProcessGroupNCCL::recvAnysource( std::vector& /* unused */, int /* unused */) { throw std::runtime_error("ProcessGroupNCCL does not support recvAnysource"); } -std::shared_ptr ProcessGroupNCCL::allgather_base( +c10::intrusive_ptr ProcessGroupNCCL::allgather_base( at::Tensor& /*unused */, at::Tensor& /*unused */, const AllgatherOptions& /*unused */) { diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp index 1520604629f2..b93bd0c2d70c 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.hpp +++ b/torch/lib/c10d/ProcessGroupNCCL.hpp @@ -65,7 +65,7 @@ constexpr const char* NCCL_ASYNC_ERROR_HANDLING = "NCCL_ASYNC_ERROR_HANDLING"; class ProcessGroupNCCL : public ProcessGroup { public: class WorkNCCL : public ProcessGroup::Work, - public std::enable_shared_from_this { + public std::enable_shared_from_this { public: // Constructor takes a list of CUDA devices WorkNCCL(const std::vector& devices, int 
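// Sketch (illustrative only, not applied by this patch): most call sites are insulated
// from the pointer-type change because they hold the handle in an `auto` variable and
// only use operator->, which std::shared_ptr and c10::intrusive_ptr both provide.
//
#include <exception>
#include <vector>
#include <ATen/ATen.h>
#include <c10d/ProcessGroup.hpp>

void call_site_demo(c10d::ProcessGroup& pg, std::vector<at::Tensor>& tensors) {
  auto work = pg.allreduce(tensors);  // now deduced as c10::intrusive_ptr<ProcessGroup::Work>
  work->wait();                       // member access is unchanged
  if (!work->isSuccess()) {
    std::rethrow_exception(work->exception());
  }
}
// end of sketch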
rank, OpType opType, const char* profilingTitle = nullptr); @@ -163,7 +163,7 @@ class ProcessGroupNCCL : public ProcessGroup { // Reference to the store so that we can write aborted communicators // to the store. - std::shared_ptr store_; + c10::intrusive_ptr store_; // Store a reference to NCCL collective's outputs to be used by getFuture. std::shared_ptr> outputs_; @@ -393,7 +393,7 @@ class ProcessGroupNCCL : public ProcessGroup { // communicator. These NCCL communicators are cached and reused if possible. // ProcessGroupNCCL( - const std::shared_ptr& store, + const c10::intrusive_ptr& store, int rank, int size, Options options = Options()); @@ -402,7 +402,7 @@ class ProcessGroupNCCL : public ProcessGroup { // If you have existing code that uses the `groupName`, you can replace // it by specifying a `c10d::PrefixStore(groupName, store)` for store. C10_DEPRECATED ProcessGroupNCCL( - const std::shared_ptr& store, + const c10::intrusive_ptr& store, int rank, int size, const std::string& groupName, @@ -411,64 +411,64 @@ class ProcessGroupNCCL : public ProcessGroup { virtual ~ProcessGroupNCCL(); - std::shared_ptr broadcast( + c10::intrusive_ptr broadcast( std::vector& tensors, const BroadcastOptions& opts = BroadcastOptions()) override; - std::shared_ptr allreduce( + c10::intrusive_ptr allreduce( std::vector& tensors, const AllreduceOptions& opts = AllreduceOptions()) override; - std::shared_ptr allreduce_coalesced( + c10::intrusive_ptr allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts = AllreduceCoalescedOptions()) override; - std::shared_ptr reduce( + c10::intrusive_ptr reduce( std::vector& tensors, const ReduceOptions& opts = ReduceOptions()) override; - std::shared_ptr allgather( + c10::intrusive_ptr allgather( std::vector>& outputTensors, std::vector& inputTensors, const AllgatherOptions& opts = AllgatherOptions()) override; - std::shared_ptr allgather_base( + c10::intrusive_ptr allgather_base( at::Tensor& outputbuffer, at::Tensor& inputbuffer, const AllgatherOptions& opts = AllgatherOptions()) override; - std::shared_ptr allgather_coalesced( + c10::intrusive_ptr allgather_coalesced( std::vector>& outputTensorLists, std::vector& inputTensors, const AllgatherOptions& opts = AllgatherOptions()) override; - std::shared_ptr reduce_scatter( + c10::intrusive_ptr reduce_scatter( std::vector& outputTensors, std::vector>& inputTensors, const ReduceScatterOptions& opts = ReduceScatterOptions()) override; - std::shared_ptr barrier( + c10::intrusive_ptr barrier( const BarrierOptions& opts = BarrierOptions()) override; - std::shared_ptr alltoall_base( + c10::intrusive_ptr alltoall_base( at::Tensor& outputTensor, at::Tensor& inputTensor, std::vector& outputSplitSizes, std::vector& inputSplitSizes, const AllToAllOptions& opts = AllToAllOptions()) override; - std::shared_ptr alltoall( + c10::intrusive_ptr alltoall( std::vector& outputTensors, std::vector& inputTensors, const AllToAllOptions& opts = AllToAllOptions()) override; - std::shared_ptr send( + c10::intrusive_ptr send( std::vector& tensors, int dstRank, int tag) override; - std::shared_ptr recv( + c10::intrusive_ptr recv( std::vector& tensors, int srcRank, int tag) override; @@ -478,17 +478,17 @@ class ProcessGroupNCCL : public ProcessGroup { static void groupEnd(); // Unsupported Ops - std::shared_ptr gather( + c10::intrusive_ptr gather( std::vector>& outputTensors, std::vector& inputTensors, const GatherOptions& opts = GatherOptions()) override; - std::shared_ptr scatter( + c10::intrusive_ptr scatter( 
std::vector& outputTensors, std::vector>& inputTensors, const ScatterOptions& opts = ScatterOptions()) override; - std::shared_ptr recvAnysource( + c10::intrusive_ptr recvAnysource( std::vector& tensors, int tag) override; @@ -515,7 +515,7 @@ class ProcessGroupNCCL : public ProcessGroup { virtual std::exception_ptr checkForNCCLErrors( const std::vector>& ncclComms); - virtual std::shared_ptr initWork( + virtual c10::intrusive_ptr initWork( std::vector devices, int rank, OpType opType, @@ -529,14 +529,14 @@ class ProcessGroupNCCL : public ProcessGroup { // ncclComm_t, at::cuda::CUDAStream&); // void {pre,post}(std::vector); template - std::shared_ptr collective( + c10::intrusive_ptr collective( std::vector& input, std::vector& output, Fn fn, OpType opType, const char* profilingTitle = nullptr); template - std::shared_ptr collective( + c10::intrusive_ptr collective( std::vector& input, std::vector& output, Fn fn, @@ -549,13 +549,13 @@ class ProcessGroupNCCL : public ProcessGroup { // primitives. It is the same structure as the helper used for collective // communicaiton primitives. template - std::shared_ptr pointToPoint( + c10::intrusive_ptr pointToPoint( std::vector& tensor, Fn fn, int peer, OpType opType); template - std::shared_ptr pointToPoint( + c10::intrusive_ptr pointToPoint( std::vector& tensor, Fn fn, int peer, @@ -594,7 +594,7 @@ class ProcessGroupNCCL : public ProcessGroup { static const int64_t kWorkCleanupThreadSleepMillis; // The store is used to broadcast the NCCL unique ID of rank 0. - std::shared_ptr store_; + c10::intrusive_ptr store_; // The number of NCCL communicators that have been created during // the lifetime of this process group. This sequence number is @@ -664,7 +664,7 @@ class ProcessGroupNCCL : public ProcessGroup { std::list workMetaList_; // Add Work Pointer to workVector - void workEnqueue(std::shared_ptr); + void workEnqueue(c10::intrusive_ptr); // The CUDA steams used by NCCL kernels std::unordered_map> diff --git a/torch/lib/c10d/ProcessGroupRoundRobin.cpp b/torch/lib/c10d/ProcessGroupRoundRobin.cpp index 032f63c320f5..c77188577a62 100644 --- a/torch/lib/c10d/ProcessGroupRoundRobin.cpp +++ b/torch/lib/c10d/ProcessGroupRoundRobin.cpp @@ -17,66 +17,66 @@ ProcessGroupRoundRobin::ProcessGroupRoundRobin( ProcessGroupRoundRobin::~ProcessGroupRoundRobin() {} -std::shared_ptr ProcessGroupRoundRobin::broadcast( +c10::intrusive_ptr ProcessGroupRoundRobin::broadcast( std::vector& tensors, const BroadcastOptions& opts) { return next()->broadcast(tensors, opts); } -std::shared_ptr ProcessGroupRoundRobin::allreduce( +c10::intrusive_ptr ProcessGroupRoundRobin::allreduce( std::vector& tensors, const AllreduceOptions& opts) { return next()->allreduce(tensors, opts); } -std::shared_ptr ProcessGroupRoundRobin::allreduce_coalesced( +c10::intrusive_ptr ProcessGroupRoundRobin::allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts) { return next()->allreduce_coalesced(tensors, opts); } -std::shared_ptr ProcessGroupRoundRobin::reduce( +c10::intrusive_ptr ProcessGroupRoundRobin::reduce( std::vector& tensors, const ReduceOptions& opts) { return next()->reduce(tensors, opts); } -std::shared_ptr ProcessGroupRoundRobin::allgather( +c10::intrusive_ptr ProcessGroupRoundRobin::allgather( std::vector>& outputs, std::vector& inputs, const AllgatherOptions& opts) { return next()->allgather(outputs, inputs, opts); }; -std::shared_ptr ProcessGroupRoundRobin::allgather_coalesced( +c10::intrusive_ptr ProcessGroupRoundRobin::allgather_coalesced( std::vector>& 
outputTensorLists, std::vector& inputTensors, const AllgatherOptions& opts) { return next()->allgather(outputTensorLists, inputTensors, opts); } -std::shared_ptr ProcessGroupRoundRobin::gather( +c10::intrusive_ptr ProcessGroupRoundRobin::gather( std::vector>& outputs, std::vector& inputs, const GatherOptions& opts) { return next()->gather(outputs, inputs, opts); }; -std::shared_ptr ProcessGroupRoundRobin::scatter( +c10::intrusive_ptr ProcessGroupRoundRobin::scatter( std::vector& outputs, std::vector>& inputs, const ScatterOptions& opts) { return next()->scatter(outputs, inputs, opts); }; -std::shared_ptr ProcessGroupRoundRobin::reduce_scatter( +c10::intrusive_ptr ProcessGroupRoundRobin::reduce_scatter( std::vector& outputs, std::vector>& inputs, const ReduceScatterOptions& opts) { return next()->reduce_scatter(outputs, inputs, opts); }; -std::shared_ptr ProcessGroupRoundRobin::alltoall_base( +c10::intrusive_ptr ProcessGroupRoundRobin::alltoall_base( at::Tensor& outputTensor, at::Tensor& inputTensor, std::vector& outputSplitSizes, @@ -86,27 +86,27 @@ std::shared_ptr ProcessGroupRoundRobin::alltoall_base( outputTensor, inputTensor, outputSplitSizes, inputSplitSizes, opts); }; -std::shared_ptr ProcessGroupRoundRobin::send( +c10::intrusive_ptr ProcessGroupRoundRobin::send( std::vector& /* unused */, int /* unused */, int /* unused */) { throw std::runtime_error("ProcessGroupRoundRobin does not support send"); }; -std::shared_ptr ProcessGroupRoundRobin::recv( +c10::intrusive_ptr ProcessGroupRoundRobin::recv( std::vector& /* unused */, int /* unused */, int /* unused */) { throw std::runtime_error("ProcessGroupRoundRobin does not support recv"); }; -std::shared_ptr ProcessGroupRoundRobin::recvAnysource( +c10::intrusive_ptr ProcessGroupRoundRobin::recvAnysource( std::vector& /* unused */, int /* unused */) { throw std::runtime_error("ProcessGroupRoundRobin does not support recv"); }; -std::shared_ptr ProcessGroupRoundRobin::barrier( +c10::intrusive_ptr ProcessGroupRoundRobin::barrier( const BarrierOptions& /* unused */) { throw std::runtime_error("ProcessGroupRoundRobin does not support barrier"); }; @@ -120,7 +120,7 @@ const std::shared_ptr& ProcessGroupRoundRobin::next() { return processGroup; } -std::shared_ptr ProcessGroupRoundRobin::allgather_base( +c10::intrusive_ptr ProcessGroupRoundRobin::allgather_base( at::Tensor& /*unused */, at::Tensor& /*unused */, const AllgatherOptions& /*unused */) { diff --git a/torch/lib/c10d/ProcessGroupRoundRobin.hpp b/torch/lib/c10d/ProcessGroupRoundRobin.hpp index bbbd0a1c756b..62d59ef18ce5 100644 --- a/torch/lib/c10d/ProcessGroupRoundRobin.hpp +++ b/torch/lib/c10d/ProcessGroupRoundRobin.hpp @@ -25,75 +25,75 @@ class ProcessGroupRoundRobin final : public ProcessGroup { ~ProcessGroupRoundRobin() override; - std::shared_ptr broadcast( + c10::intrusive_ptr broadcast( std::vector& tensors, const BroadcastOptions& opts = BroadcastOptions()) override; - std::shared_ptr allreduce( + c10::intrusive_ptr allreduce( std::vector& tensors, const AllreduceOptions& opts = AllreduceOptions()) override; - std::shared_ptr allreduce_coalesced( + c10::intrusive_ptr allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts = AllreduceCoalescedOptions()) override; - std::shared_ptr reduce( + c10::intrusive_ptr reduce( std::vector& tensors, const ReduceOptions& opts = ReduceOptions()) override; - std::shared_ptr allgather( + c10::intrusive_ptr allgather( std::vector>& outputs, std::vector& inputs, const AllgatherOptions& opts = AllgatherOptions()) override; 
- std::shared_ptr allgather_base( + c10::intrusive_ptr allgather_base( at::Tensor& outputBuffer, at::Tensor& inputBuffer, const AllgatherOptions& opts = AllgatherOptions()) override; - std::shared_ptr allgather_coalesced( + c10::intrusive_ptr allgather_coalesced( std::vector>& outputTensorLists, std::vector& inputTensors, const AllgatherOptions& opts = AllgatherOptions()) override; - std::shared_ptr gather( + c10::intrusive_ptr gather( std::vector>& outputs, std::vector& inputs, const GatherOptions& opts = GatherOptions()) override; - std::shared_ptr scatter( + c10::intrusive_ptr scatter( std::vector& outputs, std::vector>& inputs, const ScatterOptions& opts = ScatterOptions()) override; - std::shared_ptr reduce_scatter( + c10::intrusive_ptr reduce_scatter( std::vector& outputs, std::vector>& inputs, const ReduceScatterOptions& opts = ReduceScatterOptions()) override; - std::shared_ptr alltoall_base( + c10::intrusive_ptr alltoall_base( at::Tensor& outputTensor, at::Tensor& inputTensor, std::vector& outputSplitSizes, std::vector& inputSplitSizes, const AllToAllOptions& opts = AllToAllOptions()) override; - std::shared_ptr send( + c10::intrusive_ptr send( std::vector& tensors, int dstRank, int tag) override; - std::shared_ptr recv( + c10::intrusive_ptr recv( std::vector& tensors, int srcRank, int tag) override; - std::shared_ptr recvAnysource( + c10::intrusive_ptr recvAnysource( std::vector& tensors, int tag) override; - std::shared_ptr barrier( + c10::intrusive_ptr barrier( const BarrierOptions& opts = BarrierOptions()) override; private: diff --git a/torch/lib/c10d/Store.hpp b/torch/lib/c10d/Store.hpp index e42bbf300e0b..f97e80013cdb 100644 --- a/torch/lib/c10d/Store.hpp +++ b/torch/lib/c10d/Store.hpp @@ -6,9 +6,11 @@ #include #include +#include + namespace c10d { -class Store { +class Store : public torch::CustomClassHolder { public: static constexpr std::chrono::milliseconds kDefaultTimeout = std::chrono::seconds(300); diff --git a/torch/lib/c10d/comm.cpp b/torch/lib/c10d/comm.cpp index a8628e0c942e..5ef88f058aca 100644 --- a/torch/lib/c10d/comm.cpp +++ b/torch/lib/c10d/comm.cpp @@ -45,8 +45,10 @@ class BroadcastWork { // because c10d::ProcessGroup::broadcast takes a vector argument. std::vector flat_tensor_; + private: + // The broadcast work that is kicked off upon construction. - std::shared_ptr work_; + c10::intrusive_ptr work_; }; } // namespace diff --git a/torch/lib/c10d/example/allreduce.cpp b/torch/lib/c10d/example/allreduce.cpp index 76d6a5588f7e..3de7447d092a 100644 --- a/torch/lib/c10d/example/allreduce.cpp +++ b/torch/lib/c10d/example/allreduce.cpp @@ -19,7 +19,7 @@ int main(int argc, char** argv) { } // Kick off work - std::vector> pending; + std::vector> pending; for (auto i = 0; i < ntensors; i++) { std::vector tmp = {tensors[i]}; pending.push_back(pg.allreduce(tmp)); diff --git a/torch/lib/c10d/frontend.hpp b/torch/lib/c10d/frontend.hpp index 69705427b53c..3449ee30b5ef 100644 --- a/torch/lib/c10d/frontend.hpp +++ b/torch/lib/c10d/frontend.hpp @@ -35,7 +35,7 @@ class DistributedC10d { const std::chrono::milliseconds& timeout, int64_t world_size, int64_t rank, - std::shared_ptr store, + c10::intrusive_ptr store, const std::string& group_name); void destroyProcessGroup(std::shared_ptr group); @@ -202,7 +202,7 @@ class DistributedC10d { // need to use ProcessGroup or ProcesGroup* as key. 
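// Sketch (illustrative only, not applied by this patch): giving Store (and
// ProcessGroup::Work) the torch::CustomClassHolder base is the prerequisite for exposing
// them as TorchBind custom classes, i.e. objects that can live inside an IValue and be
// passed through TorchScript. The class below is hypothetical; only the registration
// pattern is the point.
//
#include <string>
#include <torch/custom_class.h>

struct DemoKeyValue : torch::CustomClassHolder {
  void set(const std::string& key, const std::string& value) { key_ = key; value_ = value; }
  std::string get(const std::string& key) { return key == key_ ? value_ : std::string(); }
  std::string key_, value_;
};

// Static registration, following the TorchBind custom-class pattern.
static auto demoKeyValueClass =
    torch::class_<DemoKeyValue>("demo_classes", "DemoKeyValue")
        .def(torch::init<>())
        .def("set", &DemoKeyValue::set)
        .def("get", &DemoKeyValue::get);
// end of sketch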
std::unordered_map< std::shared_ptr, - std::pair>> + std::pair>> pg_map_; // Note, this is different mapping relationship than original Python diff --git a/torch/lib/c10d/reducer.cpp b/torch/lib/c10d/reducer.cpp index c05ce685bb7d..c5ee54a9ee8e 100644 --- a/torch/lib/c10d/reducer.cpp +++ b/torch/lib/c10d/reducer.cpp @@ -472,7 +472,7 @@ std::vector> Reducer::get_bucket_tensors() const { } void Reducer::set_forward_pass_work_handle( - std::shared_ptr forwardPassWorkHandle, + c10::intrusive_ptr forwardPassWorkHandle, bool useStaticWorldSize) { std::lock_guard lock(mutex_); forwardPassWorkHandle_.workHandle = std::move(forwardPassWorkHandle); diff --git a/torch/lib/c10d/reducer.hpp b/torch/lib/c10d/reducer.hpp index 4874f0dd8703..e0fe0004f88e 100644 --- a/torch/lib/c10d/reducer.hpp +++ b/torch/lib/c10d/reducer.hpp @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -96,7 +97,7 @@ class Reducer { // Creates and sets ForwardPassWorkHandle given a ProcessGroup::Work and the // corresponding tensor being reduced. void set_forward_pass_work_handle( - std::shared_ptr forwardPassWorkHandle, + c10::intrusive_ptr forwardPassWorkHandle, bool useStaticWorldSize); // Retrieve on-device tensors used to track locally unused parameters. For @@ -158,7 +159,7 @@ class Reducer { bool local_used_maps_reduced_; // Work handle for allreduce on local_used_maps_ - std::shared_ptr local_used_work_; + c10::intrusive_ptr local_used_work_; void verify_replicas_within_process(); @@ -282,7 +283,7 @@ class Reducer { size_t pending; // Keep work handle around when this set of buckets is being reduced. - std::shared_ptr work; + c10::intrusive_ptr work; // Keep future work handle around if DDP comm hook is registered. c10::intrusive_ptr future_work; @@ -340,7 +341,7 @@ class Reducer { // A struct containing work handle and tensor for allreduce scheduled in // forward pass, if applicable. struct ForwardPassAllreduceWork { - std::shared_ptr workHandle; + c10::intrusive_ptr workHandle; at::Tensor resultTensor; // whether we should divide by the initial world_size or the no. of // remaining DDP ranks. 
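// Sketch (illustrative only, not applied by this patch): a default-constructed
// c10::intrusive_ptr is empty, so structs such as the reducer's ForwardPassAllreduceWork
// can hold an optional work handle without an extra flag. TrackedWork is a hypothetical
// stand-in for the Work type.
//
#include <c10/util/intrusive_ptr.h>

struct TrackedWork : c10::intrusive_ptr_target {};

struct ForwardPassHandle {
  c10::intrusive_ptr<TrackedWork> workHandle;  // empty until an allreduce is scheduled
};

bool has_pending_work(const ForwardPassHandle& h) {
  return h.workHandle.get() != nullptr;  // get() returns nullptr for an empty handle
}
// end of sketch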
diff --git a/torch/lib/c10d/test/FileStoreTest.cpp b/torch/lib/c10d/test/FileStoreTest.cpp index cc8da6326091..ce75c78adce7 100644 --- a/torch/lib/c10d/test/FileStoreTest.cpp +++ b/torch/lib/c10d/test/FileStoreTest.cpp @@ -41,7 +41,7 @@ std::string tmppath() { void testGetSet(std::string path, std::string prefix = "") { // Basic Set/Get on File Store { - auto fileStore = std::make_shared(path, 2); + auto fileStore = c10::make_intrusive(path, 2); c10d::PrefixStore store(prefix, fileStore); c10d::test::set(store, "key0", "value0"); c10d::test::set(store, "key1", "value1"); @@ -53,7 +53,7 @@ void testGetSet(std::string path, std::string prefix = "") { // Perform get on new instance { - auto fileStore = std::make_shared(path, 2); + auto fileStore = c10::make_intrusive(path, 2); c10d::PrefixStore store(prefix, fileStore); c10d::test::check(store, "key0", "value0"); } @@ -69,7 +69,8 @@ void stressTestStore(std::string path, std::string prefix = "") { for (auto i = 0; i < numThreads; i++) { threads.push_back(std::thread([&] { - auto fileStore = std::make_shared(path, numThreads + 1); + auto fileStore = + c10::make_intrusive(path, numThreads + 1); c10d::PrefixStore store(prefix, fileStore); sem1.post(); sem2.wait(); @@ -87,7 +88,7 @@ void stressTestStore(std::string path, std::string prefix = "") { // Check that the counter has the expected value { - auto fileStore = std::make_shared(path, numThreads + 1); + auto fileStore = c10::make_intrusive(path, numThreads + 1); c10d::PrefixStore store(prefix, fileStore); std::string expected = std::to_string(numThreads * numIterations); c10d::test::check(store, "counter", expected); diff --git a/torch/lib/c10d/test/HashStoreTest.cpp b/torch/lib/c10d/test/HashStoreTest.cpp index a16f83231a58..24b7fc76a417 100644 --- a/torch/lib/c10d/test/HashStoreTest.cpp +++ b/torch/lib/c10d/test/HashStoreTest.cpp @@ -11,7 +11,7 @@ void testGetSet(std::string prefix = "") { // Basic set/get { - auto hashStore = std::make_shared(); + auto hashStore = c10::make_intrusive(); c10d::PrefixStore store(prefix, hashStore); c10d::test::set(store, "key0", "value0"); c10d::test::set(store, "key1", "value1"); @@ -32,7 +32,7 @@ void testGetSet(std::string prefix = "") { // get() waits up to timeout_. 
{ - auto hashStore = std::make_shared(); + auto hashStore = c10::make_intrusive(); c10d::PrefixStore store(prefix, hashStore); std::thread th([&]() { c10d::test::set(store, "key0", "value0"); }); c10d::test::check(store, "key0", "value0"); @@ -47,7 +47,7 @@ void stressTestStore(std::string prefix = "") { std::vector threads; c10d::test::Semaphore sem1, sem2; - auto hashStore = std::make_shared(); + auto hashStore = c10::make_intrusive(); c10d::PrefixStore store(prefix, hashStore); for (auto i = 0; i < numThreads; i++) { diff --git a/torch/lib/c10d/test/ProcessGroupGlooAsyncTest.cpp b/torch/lib/c10d/test/ProcessGroupGlooAsyncTest.cpp index 92dede9a573e..091ea9b2ad07 100644 --- a/torch/lib/c10d/test/ProcessGroupGlooAsyncTest.cpp +++ b/torch/lib/c10d/test/ProcessGroupGlooAsyncTest.cpp @@ -45,7 +45,7 @@ class AsyncTest { } void start(int rank, int size) { - auto store = std::make_shared<::c10d::FileStore>(path_, size); + auto store = c10::make_intrusive<::c10d::FileStore>(path_, size); // Use tiny timeout to make this test run fast ::c10d::ProcessGroupGloo::Options options; @@ -93,7 +93,7 @@ class AsyncInputIsOutputTest : public AsyncTest { } } - void wait(std::shared_ptr& work) { + void wait(c10::intrusive_ptr& work) { at::cuda::CUDAMultiStreamGuard guard(streams_); work->wait(); } @@ -130,7 +130,7 @@ class AsyncAllreduceTest : public AsyncInputIsOutputTest { AsyncAllreduceTest(const std::string& path, int numTensors) : AsyncInputIsOutputTest(path, numTensors) {} - std::shared_ptr run() { + c10::intrusive_ptr run() { // For the duration of this function, make THC use our streams at::cuda::CUDAMultiStreamGuard guard(streams_); @@ -156,7 +156,7 @@ class AsyncBroadcastTest : public AsyncInputIsOutputTest { AsyncBroadcastTest(const std::string& path, int numTensors) : AsyncInputIsOutputTest(path, numTensors) {} - std::shared_ptr run(int rootRank, int rootTensor) { + c10::intrusive_ptr run(int rootRank, int rootTensor) { // For the duration of this function, make THC use our streams at::cuda::CUDAMultiStreamGuard guard(streams_); @@ -185,7 +185,7 @@ void runAsyncAllreduceTest( size_t numProcesses = 4, size_t numTensors = 2) { auto tests = initialize(path, numProcesses, numTensors); - std::vector> work(numProcesses); + std::vector> work(numProcesses); for (size_t i = 0; i < numProcesses; i++) { work[i] = tests[i].run(); } @@ -229,7 +229,7 @@ void runAsyncBroadcastTest( // Try every permutation of root rank and root tensor for (size_t rootRank = 0; rootRank < numProcesses; rootRank++) { for (size_t rootTensor = 0; rootTensor < numTensors; rootTensor++) { - std::vector> work(numProcesses); + std::vector> work(numProcesses); for (size_t i = 0; i < numProcesses; i++) { work[i] = tests[i].run(rootRank, rootTensor); } diff --git a/torch/lib/c10d/test/ProcessGroupGlooTest.cpp b/torch/lib/c10d/test/ProcessGroupGlooTest.cpp index da4f9b5fc106..469cf32a8442 100644 --- a/torch/lib/c10d/test/ProcessGroupGlooTest.cpp +++ b/torch/lib/c10d/test/ProcessGroupGlooTest.cpp @@ -44,8 +44,8 @@ class SignalTest { }); } - std::shared_ptr<::c10d::ProcessGroup::Work> run(int rank, int size) { - auto store = std::make_shared<::c10d::FileStore>(path_, size); + c10::intrusive_ptr<::c10d::ProcessGroup::Work> run(int rank, int size) { + auto store = c10::make_intrusive<::c10d::FileStore>(path_, size); ::c10d::ProcessGroupGloo::Options options; // Set a timeout that is small enough to make this test run fast, but also @@ -62,7 +62,7 @@ class SignalTest { }; // Loop until an exception happens - 
std::shared_ptr<::c10d::ProcessGroup::Work> work; + c10::intrusive_ptr<::c10d::ProcessGroup::Work> work; while (true) { work = pg.allreduce(tensors); try { @@ -82,7 +82,7 @@ class SignalTest { Semaphore sem_; }; -std::shared_ptr<::c10d::ProcessGroup::Work> testSignal( +c10::intrusive_ptr<::c10d::ProcessGroup::Work> testSignal( const std::string& path, int signal) { Fork fork; @@ -101,13 +101,13 @@ std::shared_ptr<::c10d::ProcessGroup::Work> testSignal( class ProcessGroupGlooDelayed : public ::c10d::ProcessGroupGloo { public: ProcessGroupGlooDelayed( - const std::shared_ptr<::c10d::Store>& store, + const c10::intrusive_ptr<::c10d::Store>& store, int rank, int size, Options options) : ProcessGroupGloo(store, rank, size, options) {} - std::shared_ptr<::c10d::ProcessGroup::Work> send( + c10::intrusive_ptr<::c10d::ProcessGroup::Work> send( std::vector& tensors, int dstRank, int tag) override { @@ -151,7 +151,7 @@ class CollectiveTest { } void start(int rank, int size, bool delayed) { - auto store = std::make_shared<::c10d::FileStore>(path_, size); + auto store = c10::make_intrusive<::c10d::FileStore>(path_, size); // Set a timeout that is small enough to make this test run fast, but also // make sure that we don't get timeouts in the ProcessGroupGloo constructor. @@ -200,7 +200,7 @@ void testAllreduce(const std::string& path, const at::DeviceType b) { } // Kick off work - std::vector> work(size); + std::vector> work(size); for (auto i = 0; i < size; i++) { work[i] = tests[i].getProcessGroup().allreduce(inputs[i]); } @@ -250,7 +250,7 @@ void testBroadcast(const std::string& path, const at::DeviceType b) { options.rootTensor = j; // Kick off work - std::vector> work(size); + std::vector> work(size); for (auto i = 0; i < size; i++) { work[i] = tests[i].getProcessGroup().broadcast(inputs[i], options); } @@ -316,7 +316,7 @@ void testAlltoall(const std::string& path, const at::DeviceType b) { }; // Kick off work - std::vector> work(size); + std::vector> work(size); for (auto rank = 0; rank < size; rank++) { work[rank] = tests[rank].getProcessGroup().alltoall_base( outputs[rank], inputs[rank], outputSplits[rank], inputSplits[rank]); @@ -349,7 +349,7 @@ void testBarrier(const std::string& path) { auto tests = CollectiveTest::initialize(path, size); // Kick off work - std::vector> work(size); + std::vector> work(size); for (auto i = 0; i < size; i++) { work[i] = tests[i].getProcessGroup().barrier(); } diff --git a/torch/lib/c10d/test/ProcessGroupMPITest.cpp b/torch/lib/c10d/test/ProcessGroupMPITest.cpp index 3f5a9e4cf331..6c60b3d6742d 100644 --- a/torch/lib/c10d/test/ProcessGroupMPITest.cpp +++ b/torch/lib/c10d/test/ProcessGroupMPITest.cpp @@ -14,7 +14,7 @@ // Wait for work to complete void waitWork( std::shared_ptr pg, - std::vector> works) { + std::vector> works) { for (auto& work : works) { try { work->wait(); @@ -34,10 +34,11 @@ void testAllreduce(int iter = 1000) { allTensors[i] = std::vector({tensor}); } - std::vector> works; + std::vector> works; for (auto& tensors : allTensors) { // Kick off work - std::shared_ptr<::c10d::ProcessGroup::Work> work = pg->allreduce(tensors); + c10::intrusive_ptr<::c10d::ProcessGroup::Work> work = + pg->allreduce(tensors); works.push_back(std::move(work)); } @@ -73,10 +74,11 @@ void testBroadcast(int iter = 10000) { } } - std::vector> works; + std::vector> works; for (auto& tensors : allTensors) { // Kick off work - std::shared_ptr<::c10d::ProcessGroup::Work> work = pg->broadcast(tensors); + c10::intrusive_ptr<::c10d::ProcessGroup::Work> work = + 
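// Sketch (illustrative only, not applied by this patch): after this change the tests
// stand up a Gloo process group roughly as follows; the timeout and device values are
// illustrative.
//
#include <chrono>
#include <string>
#include <vector>
#include <ATen/ATen.h>
#include <c10d/FileStore.hpp>
#include <c10d/ProcessGroupGloo.hpp>

c10::intrusive_ptr<c10d::ProcessGroup::Work> run_allreduce(
    const std::string& path, int rank, int size, std::vector<at::Tensor>& tensors) {
  auto store = c10::make_intrusive<c10d::FileStore>(path, size);

  c10d::ProcessGroupGloo::Options options;
  options.timeout = std::chrono::milliseconds(1000);
  options.devices.push_back(c10d::ProcessGroupGloo::createDefaultDevice());

  c10d::ProcessGroupGloo pg(store, rank, size, options);
  auto work = pg.allreduce(tensors);  // c10::intrusive_ptr<ProcessGroup::Work>
  work->wait();
  return work;
}
// end of sketch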
+        pg->broadcast(tensors);
     works.push_back(std::move(work));
   }
 
@@ -104,10 +106,10 @@ void testReduce(int iter = 10000) {
     allTensors[i] = std::vector<at::Tensor>({tensor});
   }
 
-  std::vector<std::shared_ptr<::c10d::ProcessGroup::Work>> works;
+  std::vector<c10::intrusive_ptr<::c10d::ProcessGroup::Work>> works;
   for (auto& tensors : allTensors) {
     // Kick off work
-    std::shared_ptr<::c10d::ProcessGroup::Work> work = pg->reduce(tensors);
+    c10::intrusive_ptr<::c10d::ProcessGroup::Work> work = pg->reduce(tensors);
     works.push_back(std::move(work));
   }
 
@@ -150,10 +152,10 @@ void testAllgather(int iter = 10000) {
     }
   }
 
-  std::vector<std::shared_ptr<::c10d::ProcessGroup::Work>> works;
+  std::vector<c10::intrusive_ptr<::c10d::ProcessGroup::Work>> works;
   for (size_t i = 0; i < allTensors.size(); ++i) {
     // Kick off work
-    std::shared_ptr<::c10d::ProcessGroup::Work> work =
+    c10::intrusive_ptr<::c10d::ProcessGroup::Work> work =
         pg->allgather(allOutputTensors[i], allTensors[i]);
     works.push_back(std::move(work));
   }
@@ -198,10 +200,10 @@ void testGather(int iter = 10000) {
     }
   }
 
-  std::vector<std::shared_ptr<::c10d::ProcessGroup::Work>> works;
+  std::vector<c10::intrusive_ptr<::c10d::ProcessGroup::Work>> works;
   for (size_t i = 0; i < allTensors.size(); ++i) {
     // Kick off work
-    std::shared_ptr<::c10d::ProcessGroup::Work> work =
+    c10::intrusive_ptr<::c10d::ProcessGroup::Work> work =
         pg->gather(allOutputTensors[i], allTensors[i]);
     works.push_back(std::move(work));
   }
@@ -249,10 +251,10 @@ void testScatter(int iter = 1) {
     }
   }
 
-  std::vector<std::shared_ptr<::c10d::ProcessGroup::Work>> works;
+  std::vector<c10::intrusive_ptr<::c10d::ProcessGroup::Work>> works;
   for (size_t i = 0; i < allTensors.size(); ++i) {
     // Kick off work
-    std::shared_ptr<::c10d::ProcessGroup::Work> work =
+    c10::intrusive_ptr<::c10d::ProcessGroup::Work> work =
         pg->scatter(allTensors[i], allInputTensors[i]);
     works.push_back(std::move(work));
   }
@@ -289,27 +291,27 @@ void testSendRecv(bool recvAnysource, int iter = 10000) {
   }
 
   if (rank == 0) {
-    std::vector<std::shared_ptr<::c10d::ProcessGroup::Work>> works;
+    std::vector<c10::intrusive_ptr<::c10d::ProcessGroup::Work>> works;
     for (auto& tensors : allTensors) {
       // Kick off work
-      std::shared_ptr<::c10d::ProcessGroup::Work> work =
+      c10::intrusive_ptr<::c10d::ProcessGroup::Work> work =
           pg->send(tensors, 1, 0);
       works.push_back(std::move(work));
     }
     waitWork(pg, works);
   }
   if (rank == 1) {
-    std::vector<std::shared_ptr<::c10d::ProcessGroup::Work>> works;
+    std::vector<c10::intrusive_ptr<::c10d::ProcessGroup::Work>> works;
     std::vector<int> srcRanks(allTensors.size(), -1);
     size_t i = 0;
     for (auto& tensors : allTensors) {
       // Kick off work
       if (!recvAnysource) {
-        std::shared_ptr<::c10d::ProcessGroup::Work> work =
+        c10::intrusive_ptr<::c10d::ProcessGroup::Work> work =
            pg->recv(tensors, 0, 0);
        works.push_back(std::move(work));
      } else {
-        std::shared_ptr<::c10d::ProcessGroup::Work> work =
+        c10::intrusive_ptr<::c10d::ProcessGroup::Work> work =
            pg->recvAnysource(tensors, 0);
        works.push_back(std::move(work));
      }
diff --git a/torch/lib/c10d/test/ProcessGroupNCCLErrorsTest.cpp b/torch/lib/c10d/test/ProcessGroupNCCLErrorsTest.cpp
index e906702a889d..e19981c523de 100644
--- a/torch/lib/c10d/test/ProcessGroupNCCLErrorsTest.cpp
+++ b/torch/lib/c10d/test/ProcessGroupNCCLErrorsTest.cpp
@@ -37,7 +37,7 @@ class WorkNCCLSimulateErrors : public c10d::ProcessGroupNCCL::WorkNCCL {
 class ProcessGroupNCCLSimulateErrors : public c10d::ProcessGroupNCCL {
  public:
  ProcessGroupNCCLSimulateErrors(
-      const std::shared_ptr<c10d::Store>& store,
+      const c10::intrusive_ptr<c10d::Store>& store,
      int rank,
      int size,
      c10d::ProcessGroupNCCL::Options opts)
@@ -56,12 +56,12 @@ class ProcessGroupNCCLSimulateErrors : public c10d::ProcessGroupNCCL {
        ProcessGroupNCCLSimulateErrors::kWatchdogThreadSleepMillis);
  }
 
-  std::shared_ptr<c10d::ProcessGroupNCCL::WorkNCCL> initWork(
+  c10::intrusive_ptr<c10d::ProcessGroupNCCL::WorkNCCL> initWork(
      std::vector<at::Device> devices,
      int rank,
      c10d::OpType opType,
      const char* profilingTitle) override {
-    return std::make_shared<WorkNCCLSimulateErrors>(
+    return c10::make_intrusive<WorkNCCLSimulateErrors>(
        devices, simulate_error_, rank, opType);
  }
 
@@ -106,19 +106,19 @@ class WorkNCCLTimedoutErrors : public c10d::ProcessGroupNCCL::WorkNCCL {
 class ProcessGroupNCCLTimedOutErrors : public ProcessGroupNCCLSimulateErrors {
  public:
  ProcessGroupNCCLTimedOutErrors(
-      const std::shared_ptr<c10d::Store>& store,
+      const c10::intrusive_ptr<c10d::Store>& store,
      int rank,
      int size,
      c10d::ProcessGroupNCCL::Options opts)
      : ProcessGroupNCCLSimulateErrors(store, rank, size, opts),
        set_timedout_error_(false) {}
 
-  std::shared_ptr<c10d::ProcessGroupNCCL::WorkNCCL> initWork(
+  c10::intrusive_ptr<c10d::ProcessGroupNCCL::WorkNCCL> initWork(
      std::vector<at::Device> devices,
      int rank,
      c10d::OpType opType,
      const char* profilingTitle) override {
-    return std::make_shared<WorkNCCLTimedoutErrors>(
+    return c10::make_intrusive<WorkNCCLTimedoutErrors>(
        devices, set_timedout_error_, rank, opType);
  }
 
@@ -153,7 +153,7 @@ class ProcessGroupNCCLErrorsTest : public ::testing::Test {
   void SetUp() override {
     size_t numDevices = cudaNumDevices();
     TemporaryFile file;
-    store_ = std::make_shared<::c10d::FileStore>(file.path, 1);
+    store_ = c10::make_intrusive<::c10d::FileStore>(file.path, 1);
 
     at::cuda::OptionalCUDAGuard deviceGuard;
     tensors_.resize(numDevices);
@@ -168,7 +168,7 @@ class ProcessGroupNCCLErrorsTest : public ::testing::Test {
   }
 
   std::vector<at::Tensor> tensors_;
-  std::shared_ptr<::c10d::FileStore> store_;
+  c10::intrusive_ptr<::c10d::FileStore> store_;
 };
 
 TEST_F(ProcessGroupNCCLErrorsTest, testNCCLErrorsBlocking) {
diff --git a/torch/lib/c10d/test/ProcessGroupNCCLTest.cpp b/torch/lib/c10d/test/ProcessGroupNCCLTest.cpp
index 92b477fae7de..fa5e988273fc 100644
--- a/torch/lib/c10d/test/ProcessGroupNCCLTest.cpp
+++ b/torch/lib/c10d/test/ProcessGroupNCCLTest.cpp
@@ -31,7 +31,7 @@ class NCCLTestBase {
   }
 
   void initialize(int rank, int size) {
-    auto store = std::make_shared<::c10d::FileStore>(path_, size);
+    auto store = c10::make_intrusive<::c10d::FileStore>(path_, size);
 
     pg_ = std::unique_ptr<::c10d::ProcessGroupNCCL>(
         new ::c10d::ProcessGroupNCCL(store, rank, size));
@@ -80,7 +80,7 @@ class NCCLTest : public NCCLTestBase {
   }
 
   void wait(
-      std::shared_ptr<c10d::ProcessGroup::Work>& work,
+      c10::intrusive_ptr<c10d::ProcessGroup::Work>& work,
      std::chrono::milliseconds timeout = kNoTimeout) {
    at::cuda::CUDAMultiStreamGuard guard(streams_);
    work->wait(timeout);
@@ -166,7 +166,7 @@ class AllreduceNCCLTest : public NCCLTest {
   AllreduceNCCLTest(const std::string& path, int worldSize)
       : NCCLTest(path, worldSize) {}
 
-  std::shared_ptr<c10d::ProcessGroup::Work> run() {
+  c10::intrusive_ptr<c10d::ProcessGroup::Work> run() {
    // For the duration of this function, make THC use our streams
    at::cuda::CUDAMultiStreamGuard guard(streams_);
 
@@ -189,7 +189,7 @@ class BroadcastNCCLTest : public NCCLTest {
   BroadcastNCCLTest(const std::string& path, int worldSize)
       : NCCLTest(path, worldSize) {}
 
-  std::shared_ptr<c10d::ProcessGroup::Work> run(int rootRank, int rootTensor) {
+  c10::intrusive_ptr<c10d::ProcessGroup::Work> run(int rootRank, int rootTensor) {
    // For the duration of this function, make THC use our streams
    at::cuda::CUDAMultiStreamGuard guard(streams_);
 
@@ -208,7 +208,7 @@ class ReduceNCCLTest : public NCCLTest {
   ReduceNCCLTest(const std::string& path, int worldSize)
       : NCCLTest(path, worldSize) {}
 
-  std::shared_ptr<c10d::ProcessGroup::Work> run(int rootRank, int rootTensor) {
+  c10::intrusive_ptr<c10d::ProcessGroup::Work> run(int rootRank, int rootTensor) {
    // For the duration of this function, make THC use our streams
    at::cuda::CUDAMultiStreamGuard guard(streams_);
 
@@ -227,7 +227,7 @@ class AllgatherNCCLTest : public NCCLTest {
   AllgatherNCCLTest(const std::string& path, int worldSize)
       : NCCLTest(path, worldSize) {}
 
-  std::shared_ptr<c10d::ProcessGroup::Work> run() {
+  c10::intrusive_ptr<c10d::ProcessGroup::Work> run() {
    // For the duration of this function, make THC use our streams
    at::cuda::CUDAMultiStreamGuard guard(streams_);
 
@@ -242,7 +242,7 @@ struct ReduceScatterNCCLTest : NCCLTest {
   ReduceScatterNCCLTest(const std::string& path, int worldSize)
       : NCCLTest(path, worldSize) {}
 
-  std::shared_ptr<c10d::ProcessGroup::Work> run() {
+  c10::intrusive_ptr<c10d::ProcessGroup::Work> run() {
    // For the duration of this function, make THC use our streams
    at::cuda::CUDAMultiStreamGuard guard(streams_);
diff --git a/torch/lib/c10d/test/TCPStoreTest.cpp b/torch/lib/c10d/test/TCPStoreTest.cpp
index 0cfa72c7801a..8073ec0345e0 100644
--- a/torch/lib/c10d/test/TCPStoreTest.cpp
+++ b/torch/lib/c10d/test/TCPStoreTest.cpp
@@ -16,7 +16,7 @@ void testHelper(const std::string& prefix = "") {
   const auto numThreads = 16;
   const auto numWorkers = numThreads + 1;
 
-  auto serverTCPStore = std::make_shared<c10d::TCPStore>(
+  auto serverTCPStore = c10::make_intrusive<c10d::TCPStore>(
      "127.0.0.1",
      0,
      numWorkers,
@@ -25,7 +25,7 @@ void testHelper(const std::string& prefix = "") {
      /* wait */ false);
 
  auto serverStore =
-      std::make_unique<c10d::PrefixStore>(prefix, serverTCPStore);
+      c10::make_intrusive<c10d::PrefixStore>(prefix, serverTCPStore);
  // server store
  auto serverThread = std::thread([&serverStore, &serverTCPStore] {
    // Wait for all workers to join.
@@ -64,13 +64,13 @@ void testHelper(const std::string& prefix = "") {
  c10d::test::Semaphore sem1, sem2;
 
  // Each thread will have a client store to send/recv data
-  std::vector<std::unique_ptr<c10d::TCPStore>> clientTCPStores;
-  std::vector<std::unique_ptr<c10d::PrefixStore>> clientStores;
+  std::vector<c10::intrusive_ptr<c10d::TCPStore>> clientTCPStores;
+  std::vector<c10::intrusive_ptr<c10d::PrefixStore>> clientStores;
  for (auto i = 0; i < numThreads; i++) {
-    clientTCPStores.push_back(std::make_unique<c10d::TCPStore>(
+    clientTCPStores.push_back(c10::make_intrusive<c10d::TCPStore>(
        "127.0.0.1", serverTCPStore->getPort(), numWorkers, false));
-    clientStores.push_back(std::unique_ptr<c10d::PrefixStore>(
-        new c10d::PrefixStore(prefix, clientTCPStores[i])));
+    clientStores.push_back(
+        c10::make_intrusive<c10d::PrefixStore>(prefix, clientTCPStores[i]));
  }
  std::string expectedCounterRes =
      std::to_string(numThreads * numIterations + 1);
diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py
index b917300c624e..d06203cb4508 100644
--- a/torch/onnx/symbolic_opset9.py
+++ b/torch/onnx/symbolic_opset9.py
@@ -209,6 +209,44 @@ def matmul(g, self, other):
 
 @parse_args('v', 'v', 'v', 't', 't')
 def addmm(g, self, mat1, mat2, beta, alpha):
+    dtype = None
+    self_dtype = self.type().scalarType()
+    mat1_dtype = mat1.type().scalarType()
+    mat2_dtype = mat2.type().scalarType()
+    if self_dtype is not None:
+        dtype = self_dtype
+    elif mat1_dtype is not None:
+        dtype = mat1_dtype
+    elif mat2_dtype is not None:
+        dtype = mat2_dtype
+
+    mat1_rank = mat1.type().dim()
+    mat2_rank = mat2.type().dim()
+
+    def isNotNoneAnd(v, u):
+        return v is not None and v != u
+
+    if dtype is not None and (isNotNoneAnd(mat1_rank, 2) or isNotNoneAnd(mat2_rank, 2)):
+        dtype = sym_help.scalar_type_to_onnx.index(sym_help.cast_pytorch_to_onnx[dtype])
+        dtype = sym_help.scalar_type_to_pytorch_type[dtype]
+
+        res1 = g.op("MatMul", mat1, mat2)
+        res2 = self
+
+        alpha = sym_help._scalar(alpha)
+        beta = sym_help._scalar(beta)
+
+        if alpha != 1:
+            alpha = g.op("Constant",
+                         value_t=torch.tensor(alpha, dtype=dtype))
+            res1 = g.op("Mul", res1, alpha)
+        if beta != 1:
+            beta = g.op("Constant",
+                        value_t=torch.tensor(sym_help._scalar(beta), dtype=dtype))
+            res2 = g.op("Mul", res2, beta)
+
+        return g.op("Add", res1, res2)
+
     return g.op("Gemm", mat1, mat2, self, beta_f=sym_help._scalar(beta), alpha_f=sym_help._scalar(alpha))
 
@@ -1110,7 +1148,8 @@ def log_softmax(g, input, dim, dtype=None):
         dim = input_dim - 1
     return_op = g.op("LogSoftmax", input, axis_i=dim)
     if dtype and dtype.node().kind() != 'prim::Constant':
-        return_op = g.op("Cast", return_op, to_i=sym_help.scalar_type_to_onnx[dtype])
+        parsed_dtype = sym_help._get_const(dtype, 'i', 'dtype')
+        return_op = g.op("Cast", return_op, to_i=sym_help.scalar_type_to_onnx[parsed_dtype])
     if is_transpose_required:
         return_op = g.op("Transpose", return_op, perm_i=axes)
     return return_op
@@ -1645,10 +1684,22 @@ def new_full(g, self, size, fill_value, dtype, layout, device, pin_memory=False)
     return full(g, size, fill_value, dtype, layout, device, pin_memory)
 
 
-def eye(g, n, m, dtype=None, layout=None, device=None, pin_memory=False):
-    shape = g.op("Concat", g.op("Unsqueeze", n, axes_i=[0]), g.op("Unsqueeze", m, axes_i=[0]), axis_i=0)
-    tensor = zeros(g, shape, dtype, layout, device)
-    return g.op("EyeLike", tensor)
+def eye(g, *args):
+    if len(args) == 5:
+        # aten::eye(n, dtype, layout, device, pin_memory)
+        n, dtype, layout, device, pin_memory = args
+        dim_size = g.op("Unsqueeze", n, axes_i=[0])
+        shape = g.op("Concat", dim_size, dim_size, axis_i=0)
+        tensor = zeros(g, shape, dtype, layout, device)
+        return g.op("EyeLike", tensor)
+    elif len(args) == 6:
+        # aten::eye(n, m, dtype, layout, device, pin_memory)
+        n, m, dtype, layout, device, pin_memory = args
+        shape = g.op("Concat", g.op("Unsqueeze", n, axes_i=[0]), g.op("Unsqueeze", m, axes_i=[0]), axis_i=0)
+        tensor = zeros(g, shape, dtype, layout, device)
+        return g.op("EyeLike", tensor)
+    else:
+        raise NotImplementedError("Unknown aten::eye signature")
 
 
 def slice(g, self, *args):
diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py
index 35e3e1ac8efb..b87612c97dbe 100644
--- a/torch/quantization/fx/quantize.py
+++ b/torch/quantization/fx/quantize.py
@@ -347,9 +347,10 @@ def _prepare(self, model, qconfig_dict, prepare_custom_config_dict, is_standalon
         # match the patterns that will get quantized
         standalone_module_names = prepare_custom_config_dict.get("standalone_module_name", None)
+        standalone_module_classes = prepare_custom_config_dict.get("standalone_module_class", None)
         custom_module_classes = get_custom_module_class_keys(
             prepare_custom_config_dict, "float_to_observed_custom_module_class")
         matches = self._find_matches(
-            model.graph, self.modules, self.patterns, standalone_module_names, custom_module_classes)
+            model.graph, self.modules, self.patterns, standalone_module_names, standalone_module_classes, custom_module_classes)
 
         # find _inputs_ to matched nodes that are not quantized, these
         # have to be quantized, which requires measuring stats,
@@ -826,7 +827,9 @@ def convert(self, model, debug=False, convert_custom_config_dict=None, is_standa
     def _find_matches(
             self, graph, modules, patterns,
-            standalone_module_names=None, custom_module_classes=None):
+            standalone_module_names=None,
+            standalone_module_classes=None,
+            custom_module_classes=None):
         """
         Matches the nodes in the input graph to quantization patterns, and
         outputs the information needed to quantize them in future steps.
@@ -850,6 +853,12 @@ def _find_matches(
         if custom_module_classes is None:
             custom_module_classes = []
 
+        if standalone_module_classes is None:
+            standalone_module_classes = []
+
+        if standalone_module_names is None:
+            standalone_module_names = []
+
         match_map = {}
         all_matched = set()
 
@@ -883,10 +892,9 @@ def record_match(pattern, node, matched):
                 match_map[node.name] = (
                     node, [node], None, CustomModuleQuantizeHandler(self, node),
                     custom_module_qconfig)
-        def is_standalone_module(module_path):
-            if standalone_module_names is None:
-                return False
-            return module_path in standalone_module_names
+        def is_standalone_module(node_target):
+            return node_target in standalone_module_names or \
+                type(self.modules[node_target]) in standalone_module_classes
 
         # add standalone modules to the match
         for node in graph.nodes:
diff --git a/torch/quantization/observer.py b/torch/quantization/observer.py
index 4dac8fb68429..fbd8168393c8 100644
--- a/torch/quantization/observer.py
+++ b/torch/quantization/observer.py
@@ -258,9 +258,9 @@ def _calculate_qparams(self, min_val: torch.Tensor, max_val: torch.Tensor) -> Tu
         min_val_neg = torch.min(min_val, torch.zeros_like(min_val))
         max_val_pos = torch.max(max_val, torch.zeros_like(max_val))
 
-        scale = torch.ones(min_val_neg.size(), dtype=torch.float32)
-        zero_point = torch.zeros(min_val_neg.size(), dtype=torch.int64)
-        device = 'cuda' if min_val_neg.is_cuda else 'cpu'
+        device = min_val_neg.device
+        scale = torch.ones(min_val_neg.size(), dtype=torch.float32, device=device)
+        zero_point = torch.zeros(min_val_neg.size(), dtype=torch.int64, device=device)
 
         if self.qscheme == torch.per_tensor_symmetric or self.qscheme == torch.per_channel_symmetric:
             max_val_pos = torch.max(-min_val_neg, max_val_pos)
@@ -297,7 +297,6 @@ def _calculate_qparams(self, min_val: torch.Tensor, max_val: torch.Tensor) -> Tu
             if self.qscheme == torch.per_channel_affine_float_qparams:
                 zero_point = torch.tensor([float(zero_point)], dtype=zero_point.dtype, device=device)
 
-
         return scale, zero_point
diff --git a/torch/quantization/quantize_fx.py b/torch/quantization/quantize_fx.py
index 93043559bf48..91d58c2966a4 100644
--- a/torch/quantization/quantize_fx.py
+++ b/torch/quantization/quantize_fx.py
@@ -75,6 +75,9 @@ def _prepare_fx(model, qconfig_dict, prepare_custom_config_dict=None, is_standal
     # standalone module and custom module config are applied in top level module
     standalone_module_names = prepare_custom_config_dict.get('standalone_module_name', [])
     skipped_module_names += standalone_module_names
+
+    standalone_module_classes = prepare_custom_config_dict.get('standalone_module_class', [])
+    skipped_module_classes += standalone_module_classes
     float_custom_module_classes = get_custom_module_class_keys(
         prepare_custom_config_dict, "float_to_observed_custom_module_class")
     skipped_module_classes += float_custom_module_classes
@@ -170,6 +173,11 @@ def prepare_fx(model, qconfig_dict, prepare_custom_config_dict=None):
       "standalone_module_name": [
          "submodule.standalone"
       ],
+
+      "standalone_module_class": [
+          StandaloneModule
+      ],
+
       # user will manually define the corresponding observed
       # module class which has a from_float class method that converts
       # float custom module to observed custom module
diff --git a/torch/tensor.py b/torch/tensor.py
index 64e7d9ee44c0..b3cb2890fde9 100644
--- a/torch/tensor.py
+++ b/torch/tensor.py
@@ -1040,7 +1040,8 @@ def _convert(ret, cls):
     if isinstance(ret, Tensor):
         ret = ret.as_subclass(cls)
 
-    if isinstance(ret, tuple):
-        ret = tuple(_convert(r, cls) for r in ret)
+    if isinstance(ret, (tuple, list)):
+        # Also handles things like namedtuples
+        ret = type(ret)(_convert(r, cls) for r in ret)
 
     return ret
diff --git a/torch/testing/__init__.py b/torch/testing/__init__.py
index 80120b019a99..c26556f4d70a 100644
--- a/torch/testing/__init__.py
+++ b/torch/testing/__init__.py
@@ -212,7 +212,8 @@ def assert_allclose(actual, expected, rtol=None, atol=None, equal_nan=True, msg=
     if not isinstance(expected, torch.Tensor):
         expected = torch.tensor(expected, dtype=actual.dtype)
     if expected.shape != actual.shape:
-        expected = expected.expand_as(actual)
+        raise AssertionError("expected tensor shape {0} doesn't match with actual tensor "
+                             "shape {1}!".format(expected.shape, actual.shape))
     if rtol is None or atol is None:
         if rtol is not None or atol is not None:
             raise ValueError("rtol and atol must both be specified or both be unspecified")
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index c61f6e709afe..c409b5265a67 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -14,7 +14,7 @@
     floating_types, floating_types_and, floating_and_complex_types, floating_and_complex_types_and,
     all_types_and_complex_and, all_types_and)
 from torch.testing._internal.common_device_type import \
-    (skipCUDAIfNoMagma, skipCPUIfNoLapack, expectedFailureCUDA,
+    (skipCUDAIfNoMagma, skipCPUIfNoLapack,
     expectedAlertNondeterministic, precisionOverride)
 from torch.testing._internal.common_utils import \
     (prod_single_zero, random_square_matrix_of_rank,
@@ -867,11 +867,9 @@ def method_tests():
        ('kthvalue', (S, S, S), (2, 1, True,), 'keepdim_dim', (), [1]),
        ('kthvalue', (S,), (2, 0,), 'dim_1d', (), [1]),
        ('kthvalue', (S,), (2, 0, True,), 'keepdim_dim_1d', (), [1]),
-        # TODO: https://github.com/pytorch/pytorch/issues/30818
-        ('kthvalue', (), (1,), 'scalar', (), (), [expectedFailureCUDA]),
-        ('kthvalue', (), (1, 0,), 'scalar_dim', (), [1], [expectedFailureCUDA]),
-        ('kthvalue', (), (1, 0, True), 'scalar_keepdim_dim', (), [1], [expectedFailureCUDA]),
-        # END TODO
+        ('kthvalue', (), (1,), 'scalar', (), ()),
+        ('kthvalue', (), (1, 0,), 'scalar_dim', (), [1]),
+        ('kthvalue', (), (1, 0, True), 'scalar_keepdim_dim', (), [1]),
        ('quantile', (S, S, S), (0.5,)),
        ('quantile', (S, S, S), (0.5, 0), 'dim', (), [1]),
        ('quantile', (S, S, S), (0.5, None, True), 'keepdim'),
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index 9ce1c58cb4da..21fd1a2a88e4 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -4111,3 +4111,145 @@ def forward(_self, input, expected_type):  # noqa
             inp = TestNamedTupleInput_1(a, b)
             model(inp, type(inp))
+
+    @require_backend({"gloo", "nccl"})
+    @require_backends_available({"gloo", "nccl"})
+    @skip_if_lt_x_gpu(2)
+    @skip_if_rocm
+    def test_ddp_control_flow_same_across_ranks(self):
+        # Control flow that is the same across ranks.
+        batch = 20
+        dim = 10
+
+        class ToyModel(nn.Module):
+            def __init__(self):
+                super(ToyModel, self).__init__()
+                self.lin1 = nn.Linear(10, 10, bias=False)
+                self.lin2 = nn.Linear(10, 10, bias=False)
+
+            def forward(self, x):
+                # Second layer is used dependent on input x.
+                use_second_layer = torch.equal(
+                    x, torch.ones(batch, dim, device=x.device)
+                )
+                if use_second_layer:
+                    return self.lin2(F.relu(self.lin1(x)))
+                else:
+                    return F.relu(self.lin1(x))
+
+        world_size = dist.get_world_size()
+        torch.cuda.set_device(self.rank)
+        model = torch.nn.parallel.DistributedDataParallel(
+            ToyModel().cuda(self.rank),
+            device_ids=[self.rank],
+            find_unused_parameters=True,
+        )
+        random_input = torch.randn(batch, dim, device=self.rank)
+        ones_input = torch.ones(batch, dim, device=self.rank)
+        for i in range(6):
+            if i % 2 == 0:
+                out = model(random_input)
+            else:
+                out = model(ones_input)
+            loss = out.sum()
+            loss.backward()
+            # On even iterations, 2nd param goes unused, on odd iterations,
+            # it is used.
+            local_used_maps = model.reducer._get_local_used_maps()
+            if i % 2 == 0:
+                expected = torch.tensor([world_size, 0], device=self.rank, dtype=torch.int32)
+            else:
+                expected = torch.tensor([world_size, world_size], device=self.rank, dtype=torch.int32)
+
+            # Validate parameter usage.
+            variable_usage_tensor = local_used_maps[0]
+            self.assertEqual(variable_usage_tensor, expected)
+
+        # Validate appropriate error message when DDP is used with
+        # find_unused_parameters=False.
+        model = torch.nn.parallel.DistributedDataParallel(
+            ToyModel().cuda(self.rank),
+            device_ids=[self.rank],
+            find_unused_parameters=False,
+        )
+        for i in range(2):
+            with self.assertRaisesRegex(
+                RuntimeError,
+                "Expected to have finished reduction in the prior iteration before starting a new one",
+            ) if i == 1 else suppress():
+                loss = model(random_input).sum()
+                loss.backward()
+
+    @require_backend({"gloo", "nccl"})
+    @require_backends_available({"gloo", "nccl"})
+    @skip_if_lt_x_gpu(2)
+    @skip_if_rocm
+    def test_ddp_control_flow_different_across_ranks(self):
+        # Control flow that is different across ranks.
+        batch = 20
+        dim = 10
+
+        class ToyModel(nn.Module):
+            def __init__(self, rank):
+                super(ToyModel, self).__init__()
+                self.lin1 = nn.Linear(10, 10, bias=False)
+                self.lin2 = nn.Linear(10, 10, bias=False)
+                self.rank = rank
+
+            def forward(self, x):
+                # Control-flow that is rank and input dependent for the
+                # model.
+                use_second_layer = (
+                    torch.equal(x, torch.ones(batch, dim, device=x.device))
+                    and self.rank == 1
+                )
+
+                if use_second_layer:
+                    return self.lin2(F.relu(self.lin1(x)))
+                else:
+                    return F.relu(self.lin1(x))
+
+        world_size = dist.get_world_size()
+        torch.cuda.set_device(self.rank)
+        model = torch.nn.parallel.DistributedDataParallel(
+            ToyModel(self.rank).cuda(self.rank),
+            device_ids=[self.rank],
+            find_unused_parameters=True,
+        )
+        random_input = torch.randn(batch, dim, device=self.rank)
+        ones_input = torch.ones(batch, dim, device=self.rank)
+        for i in range(6):
+            if i % 2 == 0:
+                out = model(random_input)
+            else:
+                out = model(ones_input)
+            loss = out.sum()
+            loss.backward()
+            # On even iterations, 2nd param goes unused, on odd iterations,
+            # it is used only on rank 1.
+            local_used_maps = model.reducer._get_local_used_maps()
+
+            if i % 2 == 0:
+                expected = torch.tensor([world_size, 0], device=self.rank, dtype=torch.int32)
+            else:
+                expected = torch.tensor([world_size, 1], device=self.rank, dtype=torch.int32)
+
+            variable_usage_tensor = local_used_maps[0]
+            # Validate parameter usage. On odd iterations, 2nd param is only
+            # used on rank 1.
+            self.assertEqual(variable_usage_tensor, expected)
+
+        # Validate appropriate error message when DDP is used with
+        # find_unused_parameters=False.
+        model = torch.nn.parallel.DistributedDataParallel(
+            ToyModel(self.rank).cuda(self.rank),
+            device_ids=[self.rank],
+            find_unused_parameters=False,
+        )
+        for i in range(2):
+            with self.assertRaisesRegex(
+                RuntimeError,
+                "Expected to have finished reduction in the prior iteration before starting a new one",
+            ) if i == 1 else suppress():
+                loss = model(random_input).sum()
+                loss.backward()
diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py
index d0ff0500063a..59657e49f427 100644
--- a/torch/utils/cpp_extension.py
+++ b/torch/utils/cpp_extension.py
@@ -562,7 +562,7 @@ def spawn(cmd):
         else:
             cflags = []
 
-        cflags = win_cuda_flags(cflags)
+        cflags = win_cuda_flags(cflags) + ['--use-local-env']
         for flag in COMMON_MSVC_FLAGS:
             cflags = ['-Xcompiler', flag] + cflags
         for ignore_warning in MSVC_IGNORE_CUDAFE_WARNINGS:
@@ -632,7 +632,7 @@ def win_wrap_ninja_compile(sources,
         cuda_post_cflags = None
         cuda_cflags = None
         if with_cuda:
-            cuda_cflags = []
+            cuda_cflags = ['--use-local-env']
             for common_cflag in common_cflags:
                 cuda_cflags.append('-Xcompiler')
                 cuda_cflags.append(common_cflag)
@@ -1429,9 +1429,17 @@ def _get_cuda_arch_flags(cflags: Optional[List[str]] = None) -> List[str]:
     # See cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake
     _arch_list = os.environ.get('TORCH_CUDA_ARCH_LIST', None)
 
-    # If not given, determine what's needed for the GPU that can be found
+    # If not given, determine what's best for the GPU / CUDA version that can be found
     if not _arch_list:
         capability = torch.cuda.get_device_capability()
+        supported_sm = [int(arch.split('_')[1])
+                        for arch in torch.cuda.get_arch_list() if 'sm_' in arch]
+        max_supported_sm = max((sm // 10, sm % 10) for sm in supported_sm)
+        # Capability of the device may be higher than what's supported by the user's
+        # NVCC, causing compilation error. User's NVCC is expected to match the one
+        # used to build pytorch, so we use the maximum supported capability of pytorch
+        # to clamp the capability.
+        capability = min(max_supported_sm, capability)
         arch_list = [f'{capability[0]}.{capability[1]}']
     else:
         # Deal with lists that are ' ' separated (only deal with ';' after)
diff --git a/torch/utils/show_pickle.py b/torch/utils/show_pickle.py
index 0e2498d64c56..9e55ebff48b9 100644
--- a/torch/utils/show_pickle.py
+++ b/torch/utils/show_pickle.py
@@ -68,6 +68,7 @@ def persistent_load(self, pid):
     def dump(cls, in_stream, out_stream):
         value = cls(in_stream).load()
         pprint.pprint(value, stream=out_stream)
+        return value
 
 
 def main(argv, output_stream=None):
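For reference, the capability clamp added to `_get_cuda_arch_flags` in torch/utils/cpp_extension.py above can be read in isolation as the minimal sketch below. The `clamp_capability` name and the sample arch lists are hypothetical stand-ins for `torch.cuda.get_device_capability()` and `torch.cuda.get_arch_list()`; this is not code from the patch itself.

```python
# Illustrative sketch only (not part of the patch above): clamp a detected device
# capability to the newest SM architecture in a given arch list, mirroring the
# _get_cuda_arch_flags change.
def clamp_capability(capability, arch_list):
    # arch_list entries look like "sm_70", "sm_80", "compute_80", ...
    supported_sm = [int(arch.split('_')[1]) for arch in arch_list if 'sm_' in arch]
    max_supported_sm = max((sm // 10, sm % 10) for sm in supported_sm)
    # A device newer than anything the toolchain supports gets compiled for the
    # newest supported architecture instead of failing at nvcc time.
    return min(max_supported_sm, capability)

assert clamp_capability((8, 6), ["sm_70", "sm_75", "sm_80"]) == (8, 0)
assert clamp_capability((7, 5), ["sm_70", "sm_75", "sm_80"]) == (7, 5)
```

With an arch list that tops out at `sm_80`, an `(8, 6)` device is compiled for `8.0` rather than erroring out in nvcc, which is the intent of the change.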