diff --git a/.circleci/cimodel/data/binary_build_data.py b/.circleci/cimodel/data/binary_build_data.py index 58fbbd08f994..21b6eebef5a1 100644 --- a/.circleci/cimodel/data/binary_build_data.py +++ b/.circleci/cimodel/data/binary_build_data.py @@ -54,7 +54,7 @@ def get_processor_arch_name(gpu_version): )), # Skip CUDA-9.2 builds on Windows windows=( - [v for v in dimensions.GPU_VERSIONS if v not in ['cuda92', "rocm3.7"]], + [v for v in dimensions.GPU_VERSIONS if v not in ['cuda92'] + dimensions.ROCM_VERSION_LABELS], OrderedDict( wheel=dimensions.STANDARD_PYTHON_VERSIONS, conda=dimensions.STANDARD_PYTHON_VERSIONS, @@ -142,11 +142,11 @@ def get_children(self): # XXX disabling conda rocm build since docker images are not there if self.find_prop("package_format") == 'conda': - gpu_versions = filter(lambda x: x != "rocm3.7", gpu_versions) + gpu_versions = filter(lambda x: x not in dimensions.ROCM_VERSION_LABELS, gpu_versions) # XXX libtorch rocm build is temporarily disabled if self.find_prop("package_format") == 'libtorch': - gpu_versions = filter(lambda x: x != "rocm3.7", gpu_versions) + gpu_versions = filter(lambda x: x not in dimensions.ROCM_VERSION_LABELS, gpu_versions) return [ArchConfigNode(self, v) for v in gpu_versions] diff --git a/.circleci/cimodel/data/dimensions.py b/.circleci/cimodel/data/dimensions.py index 93d4d645a53a..1f83cd61b13c 100644 --- a/.circleci/cimodel/data/dimensions.py +++ b/.circleci/cimodel/data/dimensions.py @@ -9,9 +9,12 @@ ROCM_VERSIONS = [ "3.7", + "3.8", ] -GPU_VERSIONS = [None] + ["cuda" + v for v in CUDA_VERSIONS] + ["rocm" + v for v in ROCM_VERSIONS] +ROCM_VERSION_LABELS = ["rocm" + v for v in ROCM_VERSIONS] + +GPU_VERSIONS = [None] + ["cuda" + v for v in CUDA_VERSIONS] + ROCM_VERSION_LABELS STANDARD_PYTHON_VERSIONS = [ "3.6", diff --git a/.circleci/cimodel/data/pytorch_build_definitions.py b/.circleci/cimodel/data/pytorch_build_definitions.py index d582348b00c8..ccd97a053516 100644 --- a/.circleci/cimodel/data/pytorch_build_definitions.py +++ b/.circleci/cimodel/data/pytorch_build_definitions.py @@ -6,7 +6,7 @@ import cimodel.lib.conf_tree as conf_tree import cimodel.lib.miniutils as miniutils from cimodel.data.pytorch_build_data import CONFIG_TREE_DATA, TopLevelNode -from cimodel.data.simple.util.branch_filters import gen_filter_dict +from cimodel.data.simple.util.branch_filters import gen_filter_dict, RC_PATTERN from cimodel.data.simple.util.docker_constants import gen_docker_image @@ -110,6 +110,8 @@ def gen_workflow_params(self, phase): parameters["resource_class"] = resource_class if phase == "build" and self.rocm_version is not None: parameters["resource_class"] = "xlarge" + if hasattr(self, 'filters'): + parameters['filters'] = self.filters return parameters def gen_workflow_job(self, phase): @@ -139,14 +141,16 @@ def gen_workflow_job(self, phase): # TODO This is a hack to special case some configs just for the workflow list class HiddenConf(object): - def __init__(self, name, parent_build=None): + def __init__(self, name, parent_build=None, filters=None): self.name = name self.parent_build = parent_build + self.filters = filters def gen_workflow_job(self, phase): return { self.gen_build_name(phase): { - "requires": [self.parent_build.gen_build_name("build")] + "requires": [self.parent_build.gen_build_name("build")], + "filters": self.filters, } } @@ -166,7 +170,8 @@ def gen_workflow_job(self, phase): "branch": self.branch, "requires": [self.parent_build], "context": "org-member", - "filters": gen_filter_dict(branches_list=["nightly"]) + "filters": 
gen_filter_dict(branches_list=["nightly"], + tags_list=RC_PATTERN) } } @@ -205,7 +210,9 @@ def gen_docs_configs(xenial_parent_config): configs.append( HiddenConf( "pytorch_python_doc_build", - parent_build=xenial_parent_config + parent_build=xenial_parent_config, + filters=gen_filter_dict(branches_list=r"/.*/", + tags_list=RC_PATTERN), ) ) configs.append( @@ -219,7 +226,9 @@ def gen_docs_configs(xenial_parent_config): configs.append( HiddenConf( "pytorch_cpp_doc_build", - parent_build=xenial_parent_config + parent_build=xenial_parent_config, + filters=gen_filter_dict(branches_list=r"/.*/", + tags_list=RC_PATTERN), ) ) configs.append( @@ -348,6 +357,8 @@ def instantiate_configs(): # run docs builds on "pytorch-linux-xenial-py3.6-gcc5.4". Docs builds # should run on a CPU-only build that runs on all PRs. + # XXX should this be updated to a more modern build? Projects are + # beginning to drop python3.6 if ( distro_name == "xenial" and fc.find_prop("pyver") == "3.6" @@ -358,6 +369,8 @@ def instantiate_configs(): and compiler_name == "gcc" and fc.find_prop("compiler_version") == "5.4" ): + c.filters = gen_filter_dict(branches_list=r"/.*/", + tags_list=RC_PATTERN) c.dependent_tests = gen_docs_configs(c) if cuda_version == "10.2" and python_version == "3.6" and not is_libtorch: diff --git a/.circleci/cimodel/data/simple/docker_definitions.py b/.circleci/cimodel/data/simple/docker_definitions.py index 59944d190383..2b3add33b9a8 100644 --- a/.circleci/cimodel/data/simple/docker_definitions.py +++ b/.circleci/cimodel/data/simple/docker_definitions.py @@ -1,6 +1,7 @@ from collections import OrderedDict from cimodel.lib.miniutils import quote +from cimodel.data.simple.util.branch_filters import gen_filter_dict, RC_PATTERN # TODO: make this generated from a matrix rather than just a static list @@ -24,25 +25,30 @@ "pytorch-linux-xenial-py3.8", "pytorch-linux-xenial-py3.6-clang7", "pytorch-linux-xenial-py3.6-gcc4.8", - "pytorch-linux-xenial-py3.6-gcc5.4", + "pytorch-linux-xenial-py3.6-gcc5.4", # this one is used in doc builds "pytorch-linux-xenial-py3.6-gcc7.2", "pytorch-linux-xenial-py3.6-gcc7", "pytorch-linux-bionic-rocm3.7-py3.6", + "pytorch-linux-bionic-rocm3.8-py3.6", ] def get_workflow_jobs(): """Generates a list of docker image build definitions""" - return [ - OrderedDict( + ret = [] + for image_name in IMAGE_NAMES: + parameters = OrderedDict({ + "name": quote(f"docker-{image_name}"), + "image_name": quote(image_name), + }) + if image_name == "pytorch-linux-xenial-py3.6-gcc5.4": + # pushing documentation on tags requires CircleCI to also + # build all the dependencies on tags, including this docker image + parameters['filters'] = gen_filter_dict(branches_list=r"/.*/", + tags_list=RC_PATTERN) + ret.append(OrderedDict( { - "docker_build_job": OrderedDict( - { - "name": quote(f"docker-{image_name}"), - "image_name": quote(image_name), - } - ) + "docker_build_job": parameters } - ) - for image_name in IMAGE_NAMES - ] + )) + return ret diff --git a/.circleci/cimodel/data/simple/ge_config_tests.py b/.circleci/cimodel/data/simple/ge_config_tests.py index 2f2dbf0027dc..235c08d62786 100644 --- a/.circleci/cimodel/data/simple/ge_config_tests.py +++ b/.circleci/cimodel/data/simple/ge_config_tests.py @@ -61,41 +61,25 @@ def gen_tree(self): MultiPartVersion([3, 6], "py"), MultiPartVersion([5, 4], "gcc"), None, - ["ge_config_legacy", "test"], + ["jit_legacy", "test"], ["pytorch_linux_xenial_py3_6_gcc5_4_build"]), GeConfigTestJob( MultiPartVersion([3, 6], "py"), MultiPartVersion([5, 4], "gcc"), None, - 
["ge_config_profiling", "test"], - ["pytorch_linux_xenial_py3_6_gcc5_4_build"]), - GeConfigTestJob( - MultiPartVersion([3, 6], "py"), - MultiPartVersion([5, 4], "gcc"), - None, - ["ge_config_simple", "test"], + ["jit_simple", "test"], ["pytorch_linux_xenial_py3_6_gcc5_4_build"], ), GeConfigTestJob( None, None, CudaVersion(10, 2), - ["cudnn7", "py3", "ge_config_legacy", "test"], - ["pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build"], - use_cuda_docker=True, - # TODO Why does the build environment specify cuda10.1, while the - # job name is cuda10_2? - build_env_override="pytorch-linux-xenial-cuda10.1-cudnn7-ge_config_legacy-test"), - GeConfigTestJob( - None, - None, - CudaVersion(10, 2), - ["cudnn7", "py3", "ge_config_profiling", "test"], + ["cudnn7", "py3", "jit_legacy", "test"], ["pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build"], use_cuda_docker=True, # TODO Why does the build environment specify cuda10.1, while the # job name is cuda10_2? - build_env_override="pytorch-linux-xenial-cuda10.1-cudnn7-ge_config_profiling-test"), + build_env_override="pytorch-linux-xenial-cuda10.1-cudnn7-jit_legacy-test"), ] diff --git a/.circleci/cimodel/data/simple/ios_definitions.py b/.circleci/cimodel/data/simple/ios_definitions.py index 4446fa24fc28..3473242bdf04 100644 --- a/.circleci/cimodel/data/simple/ios_definitions.py +++ b/.circleci/cimodel/data/simple/ios_definitions.py @@ -1,7 +1,7 @@ from cimodel.data.simple.util.versions import MultiPartVersion -IOS_VERSION = MultiPartVersion([11, 2, 1]) +IOS_VERSION = MultiPartVersion([12, 0, 0]) class ArchVariant: @@ -62,8 +62,8 @@ def gen_tree(self): WORKFLOW_DATA = [ IOSJob(IOS_VERSION, ArchVariant("x86_64"), is_org_member_context=False), - # IOSJob(IOS_VERSION, ArchVariant("arm64")), - # IOSJob(IOS_VERSION, ArchVariant("arm64", True), extra_props={"op_list": "mobilenetv2.yaml"}), + IOSJob(IOS_VERSION, ArchVariant("arm64")), + IOSJob(IOS_VERSION, ArchVariant("arm64", True), extra_props={"op_list": "mobilenetv2.yaml"}), ] diff --git a/.circleci/config.yml b/.circleci/config.yml index b32bb9b5086a..208e0d09eed0 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -640,6 +640,7 @@ jobs: export CIRCLE_SHA1="$CIRCLE_SHA1" export CIRCLE_PR_NUMBER="${CIRCLE_PR_NUMBER:-}" export CIRCLE_BRANCH="$CIRCLE_BRANCH" + export CIRCLE_JOB="$CIRCLE_JOB" cd workspace python test/print_test_stats.py test EOL @@ -924,7 +925,7 @@ jobs: smoke_mac_test: <<: *binary_linux_test_upload_params macos: - xcode: "9.4.1" + xcode: "12.0" steps: - checkout - run: @@ -949,7 +950,7 @@ jobs: binary_mac_build: <<: *binary_mac_params macos: - xcode: "9.4.1" + xcode: "12.0" steps: # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml - checkout @@ -990,7 +991,7 @@ jobs: binary_ios_build: <<: *pytorch_ios_params macos: - xcode: "11.2.1" + xcode: "12.0" steps: - attach_workspace: at: ~/workspace @@ -1017,7 +1018,7 @@ jobs: binary_ios_upload: <<: *pytorch_ios_params macos: - xcode: "11.2.1" + xcode: "12.0" steps: - attach_workspace: at: ~/workspace @@ -1187,10 +1188,13 @@ jobs: set -ex export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}:${DOCKER_TAG}-${CIRCLE_SHA1} echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE} + tag=${CIRCLE_TAG:1:5} + target=${tag:-master} + echo "building for ${target}" time docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) - export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && . 
./.circleci/scripts/python_doc_push_script.sh docs/master master site") | docker exec -u jenkins -i "$id" bash) 2>&1' + export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && . ./.circleci/scripts/python_doc_push_script.sh docs/'$target' master site") | docker exec -u jenkins -i "$id" bash) 2>&1' echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts @@ -1229,10 +1233,13 @@ jobs: set -ex export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}:${DOCKER_TAG}-${CIRCLE_SHA1} echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE} + tag=${CIRCLE_TAG:1:5} + target=${tag:-master} + echo "building for ${target}" time docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) - export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && . ./.circleci/scripts/cpp_doc_push_script.sh docs/master master") | docker exec -u jenkins -i "$id" bash) 2>&1' + export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && . ./.circleci/scripts/cpp_doc_push_script.sh docs/"$target" master") | docker exec -u jenkins -i "$id" bash) 2>&1' echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts @@ -1253,7 +1260,7 @@ jobs: environment: BUILD_ENVIRONMENT: pytorch-macos-10.13-py3-build macos: - xcode: "9.4.1" + xcode: "12.0" steps: - checkout - run_brew_for_macos_build @@ -1287,7 +1294,7 @@ jobs: environment: BUILD_ENVIRONMENT: pytorch-macos-10.13-py3-test macos: - xcode: "9.4.1" + xcode: "12.0" steps: - checkout - attach_workspace: @@ -1515,7 +1522,7 @@ jobs: pytorch_ios_build: <<: *pytorch_ios_params macos: - xcode: "11.2.1" + xcode: "12.0" steps: - checkout - run_brew_for_ios_build @@ -1534,7 +1541,7 @@ jobs: rm cert.txt bundle exec fastlane install_cert # install the provisioning profile - PROFILE=TestApp_CI.mobileprovision + PROFILE=PyTorch_CI_2021.mobileprovision PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles mkdir -pv "${PROVISIONING_PROFILES}" cd "${PROVISIONING_PROFILES}" @@ -1592,7 +1599,7 @@ jobs: command: | set -e PROJ_ROOT=/Users/distiller/project - PROFILE=TestApp_CI + PROFILE=PyTorch_CI_2021 # run the ruby build script if ! [ -x "$(command -v xcodebuild)" ]; then echo 'Error: xcodebuild is not installed.' 
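The doc-push hunks just above stop hard-coding `docs/master` and instead derive the output directory from `CIRCLE_TAG`. A small Python illustration of the two bash parameter expansions involved, `tag=${CIRCLE_TAG:1:5}` and `target=${tag:-master}` (the jobs themselves stay in bash; this is only a sketch of the logic):

```python
def docs_target(circle_tag=""):
    """Mirrors `tag=${CIRCLE_TAG:1:5}; target=${tag:-master}` from the doc-push jobs."""
    tag = circle_tag[1:6]            # skip the leading "v", keep at most five chars: "1.8.0"
    return tag if tag else "master"  # untagged builds keep publishing to docs/master

assert docs_target("v1.8.0-rc1") == "1.8.0"
assert docs_target("") == "master"
```

So a release-candidate tag such as `v1.8.0-rc1` publishes into `docs/1.8.0`, while ordinary branch builds continue to land in `docs/master`.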
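The doc-build configs and the `gcc5.4` docker/build jobs earlier in this diff also gain `filters=gen_filter_dict(branches_list=r"/.*/", tags_list=RC_PATTERN)`. The helper itself is not shown here; a hypothetical minimal stand-in that reproduces the `filters:` blocks visible in the regenerated `config.yml` further down might look like this (the `RC_PATTERN` value is inferred from that YAML, not from the helper's source):

```python
# Assumed value: it matches the `tags: only:` regex emitted into config.yml in this diff.
RC_PATTERN = r"/v[0-9]+(\.[0-9]+)*-rc[0-9]+/"

def gen_filter_dict(branches_list=None, tags_list=None):
    """Hypothetical stand-in for cimodel.data.simple.util.branch_filters.gen_filter_dict."""
    filters = {}
    if branches_list is not None:
        filters["branches"] = {"only": branches_list}
    if tags_list is not None:
        filters["tags"] = {"only": tags_list}
    return filters

# Run doc builds (and their docker-image prerequisite) on every branch *and* on RC tags:
gen_filter_dict(branches_list=r"/.*/", tags_list=RC_PATTERN)
# -> {'branches': {'only': '/.*/'}, 'tags': {'only': '/v[0-9]+(\\.[0-9]+)*-rc[0-9]+/'}}
```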
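Further back, the `dimensions.py` hunk introduces `ROCM_VERSION_LABELS` so that adding a new ROCm version ("3.8" here) automatically propagates to the Windows exclusion list and to the conda/libtorch filters in `binary_build_data.py`, which in turn generates the `rocm3.8` manywheel jobs that follow below. A compact sketch of the resulting values (the `CUDA_VERSIONS` entries are placeholders for illustration, not taken from this diff):

```python
ROCM_VERSIONS = ["3.7", "3.8"]
ROCM_VERSION_LABELS = ["rocm" + v for v in ROCM_VERSIONS]   # ['rocm3.7', 'rocm3.8']

CUDA_VERSIONS = ["92", "101", "102"]                        # placeholder values
GPU_VERSIONS = [None] + ["cuda" + v for v in CUDA_VERSIONS] + ROCM_VERSION_LABELS

# Windows wheels skip CUDA 9.2 and every ROCm flavour in one expression:
windows_gpu_versions = [v for v in GPU_VERSIONS if v not in ["cuda92"] + ROCM_VERSION_LABELS]

# conda and libtorch packages drop all ROCm builds the same way:
conda_gpu_versions = [v for v in GPU_VERSIONS if v not in ROCM_VERSION_LABELS]
```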
@@ -2130,6 +2137,39 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: "pytorch/manylinux-rocm:3.7" + - binary_linux_build: + name: binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_build + build_environment: "manywheel 3.6m rocm3.8 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/manylinux-rocm:3.8" + - binary_linux_build: + name: binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_build + build_environment: "manywheel 3.7m rocm3.8 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/manylinux-rocm:3.8" + - binary_linux_build: + name: binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_build + build_environment: "manywheel 3.8m rocm3.8 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/manylinux-rocm:3.8" - binary_linux_build: name: binary_linux_conda_3_6_cpu_devtoolset7_nightly_build build_environment: "conda 3.6 cpu devtoolset7" @@ -3429,6 +3469,51 @@ workflows: docker_image: "pytorch/manylinux-rocm:3.7" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_test + build_environment: "manywheel 3.6m rocm3.8 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:3.8" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_test + build_environment: "manywheel 3.7m rocm3.8 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:3.8" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_test + build_environment: "manywheel 3.8m rocm3.8 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:3.8" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - binary_linux_test: name: binary_linux_conda_3_6_cpu_devtoolset7_nightly_test build_environment: "conda 3.6 cpu devtoolset7" @@ -4932,6 +5017,48 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel upload_subfolder: rocm3.7 + - binary_upload: + name: binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: manywheel + upload_subfolder: rocm3.8 + - binary_upload: + name: binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: manywheel + upload_subfolder: rocm3.8 + - binary_upload: + name: binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_upload + context: org-member + requires: + - 
binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: manywheel + upload_subfolder: rocm3.8 - binary_upload: name: binary_linux_conda_3_6_cpu_devtoolset7_nightly_upload context: org-member @@ -6311,6 +6438,11 @@ workflows: - docker_build_job: name: "docker-pytorch-linux-xenial-py3.6-gcc5.4" image_name: "pytorch-linux-xenial-py3.6-gcc5.4" + filters: + branches: + only: /.*/ + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - docker_build_job: name: "docker-pytorch-linux-xenial-py3.6-gcc7.2" image_name: "pytorch-linux-xenial-py3.6-gcc7.2" @@ -6320,12 +6452,20 @@ workflows: - docker_build_job: name: "docker-pytorch-linux-bionic-rocm3.7-py3.6" image_name: "pytorch-linux-bionic-rocm3.7-py3.6" + - docker_build_job: + name: "docker-pytorch-linux-bionic-rocm3.8-py3.6" + image_name: "pytorch-linux-bionic-rocm3.8-py3.6" - pytorch_linux_build: name: pytorch_linux_xenial_py3_6_gcc5_4_build requires: - "docker-pytorch-linux-xenial-py3.6-gcc5.4" build_environment: "pytorch-linux-xenial-py3.6-gcc5.4-build" docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" + filters: + branches: + only: /.*/ + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - pytorch_linux_test: name: pytorch_linux_xenial_py3_6_gcc5_4_test requires: @@ -6333,7 +6473,17 @@ workflows: build_environment: "pytorch-linux-xenial-py3.6-gcc5.4-test" docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" resource_class: large + filters: + branches: + only: /.*/ + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - pytorch_python_doc_build: + filters: + branches: + only: /.*/ + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ requires: - pytorch_linux_xenial_py3_6_gcc5_4_build - pytorch_doc_push: @@ -6343,10 +6493,17 @@ workflows: branches: only: - nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ name: pytorch_python_doc_push requires: - pytorch_python_doc_build - pytorch_cpp_doc_build: + filters: + branches: + only: /.*/ + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ requires: - pytorch_linux_xenial_py3_6_gcc5_4_build - pytorch_doc_push: @@ -6356,6 +6513,8 @@ workflows: branches: only: - nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ name: pytorch_cpp_doc_push requires: - pytorch_cpp_doc_build @@ -6819,10 +6978,23 @@ workflows: - pytorch_linux_xenial_py3_clang5_android_ndk_r19c_arm_v7a_build - pytorch_linux_xenial_py3_clang5_android_ndk_r19c_arm_v8a_build - pytorch_ios_build: - build_environment: pytorch-ios-11.2.1-x86_64_build + build_environment: pytorch-ios-12.0.0-x86_64_build ios_arch: x86_64 ios_platform: SIMULATOR - name: pytorch_ios_11_2_1_x86_64_build + name: pytorch_ios_12_0_0_x86_64_build + - pytorch_ios_build: + build_environment: pytorch-ios-12.0.0-arm64_build + context: org-member + ios_arch: arm64 + ios_platform: OS + name: pytorch_ios_12_0_0_arm64_build + - pytorch_ios_build: + build_environment: pytorch-ios-12.0.0-arm64_custom_build + context: org-member + ios_arch: arm64 + ios_platform: OS + name: pytorch_ios_12_0_0_arm64_custom_build + op_list: mobilenetv2.yaml - pytorch_linux_build: build_environment: pytorch-linux-xenial-py3-clang5-mobile-build build_only: "1" @@ -6851,38 +7023,23 @@ workflows: requires: - docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c - pytorch_linux_test: - build_environment: pytorch-linux-xenial-py3.6-gcc5.4-ge_config_legacy-test + build_environment: 
pytorch-linux-xenial-py3.6-gcc5.4-jit_legacy-test docker_image: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4 - name: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_legacy_test + name: pytorch_linux_xenial_py3_6_gcc5_4_jit_legacy_test requires: - pytorch_linux_xenial_py3_6_gcc5_4_build resource_class: large - pytorch_linux_test: - build_environment: pytorch-linux-xenial-py3.6-gcc5.4-ge_config_profiling-test + build_environment: pytorch-linux-xenial-py3.6-gcc5.4-jit_simple-test docker_image: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4 - name: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_profiling_test + name: pytorch_linux_xenial_py3_6_gcc5_4_jit_simple_test requires: - pytorch_linux_xenial_py3_6_gcc5_4_build resource_class: large - pytorch_linux_test: - build_environment: pytorch-linux-xenial-py3.6-gcc5.4-ge_config_simple-test - docker_image: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4 - name: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_simple_test - requires: - - pytorch_linux_xenial_py3_6_gcc5_4_build - resource_class: large - - pytorch_linux_test: - build_environment: pytorch-linux-xenial-cuda10.1-cudnn7-ge_config_legacy-test + build_environment: pytorch-linux-xenial-cuda10.1-cudnn7-jit_legacy-test docker_image: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7 - name: pytorch_linux_xenial_cuda10_2_cudnn7_py3_ge_config_legacy_test - requires: - - pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build - resource_class: gpu.medium - use_cuda_docker_runtime: "1" - - pytorch_linux_test: - build_environment: pytorch-linux-xenial-cuda10.1-cudnn7-ge_config_profiling-test - docker_image: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7 - name: pytorch_linux_xenial_cuda10_2_cudnn7_py3_ge_config_profiling_test + name: pytorch_linux_xenial_cuda10_2_cudnn7_py3_jit_legacy_test requires: - pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build resource_class: gpu.medium @@ -7004,32 +7161,32 @@ workflows: requires: - binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build - binary_ios_build: - build_environment: libtorch-ios-11.2.1-nightly-x86_64-build + build_environment: libtorch-ios-12.0.0-nightly-x86_64-build context: org-member filters: branches: only: nightly ios_arch: x86_64 ios_platform: SIMULATOR - name: pytorch_ios_11_2_1_nightly_x86_64_build + name: pytorch_ios_12_0_0_nightly_x86_64_build - binary_ios_build: - build_environment: libtorch-ios-11.2.1-nightly-arm64-build + build_environment: libtorch-ios-12.0.0-nightly-arm64-build context: org-member filters: branches: only: nightly ios_arch: arm64 ios_platform: OS - name: pytorch_ios_11_2_1_nightly_arm64_build + name: pytorch_ios_12_0_0_nightly_arm64_build - binary_ios_upload: - build_environment: libtorch-ios-11.2.1-nightly-binary-build-upload + build_environment: libtorch-ios-12.0.0-nightly-binary-build-upload context: org-member filters: branches: only: nightly requires: - - pytorch_ios_11_2_1_nightly_x86_64_build - - pytorch_ios_11_2_1_nightly_arm64_build + - pytorch_ios_12_0_0_nightly_x86_64_build + - pytorch_ios_12_0_0_nightly_arm64_build - pytorch_linux_build: build_environment: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_32 docker_image: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c @@ -7464,6 +7621,42 @@ workflows: docker_image: 
"pytorch/manylinux-rocm:3.7" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly + build_environment: "manywheel 3.6m rocm3.8 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/manylinux-rocm:3.8" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly + build_environment: "manywheel 3.7m rocm3.8 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/manylinux-rocm:3.8" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly + build_environment: "manywheel 3.8m rocm3.8 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/manylinux-rocm:3.8" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - smoke_linux_test: name: smoke_linux_conda_3_6_cpu_devtoolset7_nightly build_environment: "conda 3.6 cpu devtoolset7" diff --git a/.circleci/docker/build.sh b/.circleci/docker/build.sh index 9bfa0b195499..0afc1b33c59e 100755 --- a/.circleci/docker/build.sh +++ b/.circleci/docker/build.sh @@ -262,6 +262,13 @@ case "$image" in VISION=yes ROCM_VERSION=3.7 ;; + pytorch-linux-bionic-rocm3.8-py3.6) + ANACONDA_PYTHON_VERSION=3.6 + PROTOBUF=yes + DB=yes + VISION=yes + ROCM_VERSION=3.8 + ;; *) # Catch-all for builds that are not hardcoded. PROTOBUF=yes diff --git a/.circleci/docker/common/install_base.sh b/.circleci/docker/common/install_base.sh index ac4e1f18f1ef..1fc49932fee5 100755 --- a/.circleci/docker/common/install_base.sh +++ b/.circleci/docker/common/install_base.sh @@ -118,7 +118,7 @@ esac # Install Valgrind separately since the apt-get version is too old. mkdir valgrind_build && cd valgrind_build -VALGRIND_VERSION=3.15.0 +VALGRIND_VERSION=3.16.1 if ! 
wget http://valgrind.org/downloads/valgrind-${VALGRIND_VERSION}.tar.bz2 then wget https://sourceware.org/ftp/valgrind/valgrind-${VALGRIND_VERSION}.tar.bz2 @@ -131,4 +131,3 @@ sudo make install cd ../../ rm -rf valgrind_build alias valgrind="/usr/local/bin/valgrind" - diff --git a/.circleci/docker/common/install_cache.sh b/.circleci/docker/common/install_cache.sh index f1066519cd70..17931375b6f0 100644 --- a/.circleci/docker/common/install_cache.sh +++ b/.circleci/docker/common/install_cache.sh @@ -16,7 +16,7 @@ fi chmod a+x /opt/cache/bin/sccache function write_sccache_stub() { - printf "#!/bin/sh\nexec sccache $(which $1) \$*" > "/opt/cache/bin/$1" + printf "#!/bin/sh\nexec sccache $(which $1) \"\$@\"" > "/opt/cache/bin/$1" chmod a+x "/opt/cache/bin/$1" } @@ -57,8 +57,8 @@ if [ -n "$ROCM_VERSION" ]; then TOPDIR=$(dirname $OLDCOMP) WRAPPED="$TOPDIR/original/$COMPNAME" mv "$OLDCOMP" "$WRAPPED" - printf "#!/bin/sh\nexec sccache $WRAPPED \$*" > "$OLDCOMP" - chmod a+x "$1" + printf "#!/bin/sh\nexec sccache $WRAPPED \"\$@\"" > "$OLDCOMP" + chmod a+x "$OLDCOMP" } if [[ -e "/opt/rocm/hcc/bin/hcc" ]]; then diff --git a/.circleci/scripts/binary_ios_build.sh b/.circleci/scripts/binary_ios_build.sh index efab1e5ded3a..1166b3a1bab7 100644 --- a/.circleci/scripts/binary_ios_build.sh +++ b/.circleci/scripts/binary_ios_build.sh @@ -16,6 +16,7 @@ source ~/anaconda/bin/activate # Install dependencies conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing requests --yes +conda install -c conda-forge valgrind --yes export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} # sync submodules diff --git a/.circleci/scripts/binary_ios_test.sh b/.circleci/scripts/binary_ios_test.sh index be281120016a..863b21724a5d 100644 --- a/.circleci/scripts/binary_ios_test.sh +++ b/.circleci/scripts/binary_ios_test.sh @@ -13,7 +13,7 @@ base64 --decode cert.txt -o Certificates.p12 rm cert.txt bundle exec fastlane install_cert # install the provisioning profile -PROFILE=TestApp_CI.mobileprovision +PROFILE=PyTorch_CI_2021.mobileprovision PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles mkdir -pv "${PROVISIONING_PROFILES}" cd "${PROVISIONING_PROFILES}" @@ -25,5 +25,5 @@ if ! [ -x "$(command -v xcodebuild)" ]; then echo 'Error: xcodebuild is not installed.' 
exit 1 fi -PROFILE=TestApp_CI +PROFILE=PyTorch_CI_2021 ruby ${PROJ_ROOT}/scripts/xcode_build.rb -i ${PROJ_ROOT}/build_ios/install -x ${PROJ_ROOT}/ios/TestApp/TestApp.xcodeproj -p ${IOS_PLATFORM} -c ${PROFILE} -t ${IOS_DEV_TEAM_ID} diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh index d4c31cefc7e5..ec7651823536 100755 --- a/.circleci/scripts/binary_populate_env.sh +++ b/.circleci/scripts/binary_populate_env.sh @@ -73,7 +73,7 @@ PIP_UPLOAD_FOLDER='nightly/' # We put this here so that OVERRIDE_PACKAGE_VERSION below can read from it export DATE="$(date -u +%Y%m%d)" #TODO: We should be pulling semver version from the base version.txt -BASE_BUILD_VERSION="1.7.0.dev$DATE" +BASE_BUILD_VERSION="1.8.0.dev$DATE" # Change BASE_BUILD_VERSION to git tag when on a git tag # Use 'git -C' to make doubly sure we're in the correct directory for checking # the git tag @@ -130,7 +130,7 @@ if [[ "${BUILD_FOR_SYSTEM:-}" == "windows" ]]; then fi export DATE="$DATE" -export NIGHTLIES_DATE_PREAMBLE=1.7.0.dev +export NIGHTLIES_DATE_PREAMBLE=1.8.0.dev export PYTORCH_BUILD_VERSION="$PYTORCH_BUILD_VERSION" export PYTORCH_BUILD_NUMBER="$PYTORCH_BUILD_NUMBER" export OVERRIDE_PACKAGE_VERSION="$PYTORCH_BUILD_VERSION" diff --git a/.circleci/verbatim-sources/job-specs/binary-job-specs.yml b/.circleci/verbatim-sources/job-specs/binary-job-specs.yml index bd26e8b2b373..489dfefdbff1 100644 --- a/.circleci/verbatim-sources/job-specs/binary-job-specs.yml +++ b/.circleci/verbatim-sources/job-specs/binary-job-specs.yml @@ -135,7 +135,7 @@ smoke_mac_test: <<: *binary_linux_test_upload_params macos: - xcode: "9.4.1" + xcode: "12.0" steps: - checkout - run: @@ -160,7 +160,7 @@ binary_mac_build: <<: *binary_mac_params macos: - xcode: "9.4.1" + xcode: "12.0" steps: # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml - checkout @@ -201,7 +201,7 @@ binary_ios_build: <<: *pytorch_ios_params macos: - xcode: "11.2.1" + xcode: "12.0" steps: - attach_workspace: at: ~/workspace @@ -228,7 +228,7 @@ binary_ios_upload: <<: *pytorch_ios_params macos: - xcode: "11.2.1" + xcode: "12.0" steps: - attach_workspace: at: ~/workspace diff --git a/.circleci/verbatim-sources/job-specs/job-specs-custom.yml b/.circleci/verbatim-sources/job-specs/job-specs-custom.yml index 810f16922d5c..5c7c9bf0462c 100644 --- a/.circleci/verbatim-sources/job-specs/job-specs-custom.yml +++ b/.circleci/verbatim-sources/job-specs/job-specs-custom.yml @@ -43,10 +43,13 @@ set -ex export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}:${DOCKER_TAG}-${CIRCLE_SHA1} echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE} + tag=${CIRCLE_TAG:1:5} + target=${tag:-master} + echo "building for ${target}" time docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) - export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && . ./.circleci/scripts/python_doc_push_script.sh docs/master master site") | docker exec -u jenkins -i "$id" bash) 2>&1' + export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && . 
./.circleci/scripts/python_doc_push_script.sh docs/'$target' master site") | docker exec -u jenkins -i "$id" bash) 2>&1' echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts @@ -85,10 +88,13 @@ set -ex export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}:${DOCKER_TAG}-${CIRCLE_SHA1} echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE} + tag=${CIRCLE_TAG:1:5} + target=${tag:-master} + echo "building for ${target}" time docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) - export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && . ./.circleci/scripts/cpp_doc_push_script.sh docs/master master") | docker exec -u jenkins -i "$id" bash) 2>&1' + export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && . ./.circleci/scripts/cpp_doc_push_script.sh docs/"$target" master") | docker exec -u jenkins -i "$id" bash) 2>&1' echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts @@ -109,7 +115,7 @@ environment: BUILD_ENVIRONMENT: pytorch-macos-10.13-py3-build macos: - xcode: "9.4.1" + xcode: "12.0" steps: - checkout - run_brew_for_macos_build @@ -143,7 +149,7 @@ environment: BUILD_ENVIRONMENT: pytorch-macos-10.13-py3-test macos: - xcode: "9.4.1" + xcode: "12.0" steps: - checkout - attach_workspace: @@ -371,7 +377,7 @@ pytorch_ios_build: <<: *pytorch_ios_params macos: - xcode: "11.2.1" + xcode: "12.0" steps: - checkout - run_brew_for_ios_build @@ -390,7 +396,7 @@ rm cert.txt bundle exec fastlane install_cert # install the provisioning profile - PROFILE=TestApp_CI.mobileprovision + PROFILE=PyTorch_CI_2021.mobileprovision PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles mkdir -pv "${PROVISIONING_PROFILES}" cd "${PROVISIONING_PROFILES}" @@ -448,7 +454,7 @@ command: | set -e PROJ_ROOT=/Users/distiller/project - PROFILE=TestApp_CI + PROFILE=PyTorch_CI_2021 # run the ruby build script if ! [ -x "$(command -v xcodebuild)" ]; then echo 'Error: xcodebuild is not installed.' diff --git a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml index 0f0dd76636b4..3bc7e5855a41 100644 --- a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml +++ b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml @@ -206,6 +206,7 @@ jobs: export CIRCLE_SHA1="$CIRCLE_SHA1" export CIRCLE_PR_NUMBER="${CIRCLE_PR_NUMBER:-}" export CIRCLE_BRANCH="$CIRCLE_BRANCH" + export CIRCLE_JOB="$CIRCLE_JOB" cd workspace python test/print_test_stats.py test EOL diff --git a/.github/workflows/jit_triage.yml b/.github/workflows/jit_triage.yml index af59d2160ec6..1fb967e8ffb8 100644 --- a/.github/workflows/jit_triage.yml +++ b/.github/workflows/jit_triage.yml @@ -19,7 +19,7 @@ jobs: // - io: A reference to the @actions/io package // Check if issue has a JIT label. 
- const kJitLabel = "jit"; + const kJitLabel = "oncall: jit"; issue = await github.issues.get({ owner: context.issue.owner, diff --git a/.github/workflows/quantization_triage.yml b/.github/workflows/quantization_triage.yml new file mode 100644 index 000000000000..ac337a066873 --- /dev/null +++ b/.github/workflows/quantization_triage.yml @@ -0,0 +1,78 @@ +name: quantization-triage + +on: + issues: + types: [labeled] + +jobs: + welcome: + runs-on: ubuntu-latest + steps: + - uses: actions/github-script@v2 + with: + github-token: ${{secrets.GITHUB_TOKEN}} + script: | + // Arguments available: + // - github: A pre-authenticated octokit/rest.js client + // - context: An object containing the context of the workflow run + // - core: A reference to the @actions/core package + // - io: A reference to the @actions/io package + + // Check if issue has a Quantization label. + const kQuantizationLabel = "oncall: quantization"; + + issue = await github.issues.get({ + owner: context.issue.owner, + repo: context.issue.repo, + issue_number: context.issue.number, + }) + + const hasQuantizationLabel = issue.data.labels.filter(label => label.name == kQuantizationLabel).length > 0; + + if (!hasQuantizationLabel) { + core.debug("Issue " + issue.data.title + " does not have Quantization label"); + return; + } + + // Get project column ID. + const kProjectName = "Quantization Triage"; + const kColumnName = "Need Triage"; + + // Query all projects in the repository. + // TODO: Support pagination once there are > 30 projects. + const projects = await github.projects.listForRepo({ + owner: context.issue.owner, + repo: context.issue.repo, + }); + + // Filter out unwanted projects and get the ID for the Quantization Triage project. + const filteredProjects = projects.data.filter(project => project.name == kProjectName); + + if (filteredProjects.length != 1) { + core.setFailed("Unable to find a project named " + kProjectName); + return; + } + + const projectId = filteredProjects[0].id; + // First, query all columns in the project. + // TODO: Support pagination once there are > 30 columns. + const columns = await github.projects.listColumns({ + project_id: projectId, + }); + + // Filter out unwanted projects and get the ID for the Need triage column. + const filteredColumns = columns.data.filter(column => column.name == kColumnName); + + if (filteredColumns.length != 1) { + core.setFailed("Unable to find a column named " + kColumnName); + return; + } + + const columnId = filteredColumns[0].id; + + // Create a project card for this new issue. 
+ await github.projects.createCard({ + column_id: columnId, + content_id: issue.data.id, + content_type: "Issue", + }) diff --git a/.gitmodules b/.gitmodules index 509ab94f1cf4..d7a11cc22996 100644 --- a/.gitmodules +++ b/.gitmodules @@ -130,3 +130,7 @@ ignore = dirty path = third_party/tensorpipe url = https://github.com/pytorch/tensorpipe.git +[submodule "third_party/valgrind"] + ignore = dirty + path = third_party/valgrind + url = https://sourceware.org/git/valgrind.git diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index 20a7310a91c1..58b3979f7829 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -163,7 +163,7 @@ pip install --user pytest-sugar if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then # Check out torch/vision at Jun 11 2020 commit # This hash must match one in .jenkins/pytorch/test.sh - pip install -q --user git+https://github.com/pytorch/vision.git@c2e8a00885e68ae1200eb6440f540e181d9125de + pip install -q --user git+https://github.com/pytorch/vision.git@e70c91a9ff9b8a20e05c133aec6ec3ed538c32fb pip install -q --user ninja # JIT C++ extensions require ninja, so put it into PATH. export PATH="/var/lib/jenkins/.local/bin:$PATH" @@ -171,7 +171,7 @@ if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then # default pip version is too old(9.0.2), unable to support tag `manylinux2010`. # Fix the pip error: Couldn't find a version that satisfies the requirement pip install --upgrade pip - pip install -q --user -i https://test.pypi.org/simple/ ort-nightly==1.4.0.dev202008122 + pip install -q --user -i https://test.pypi.org/simple/ ort-nightly==1.5.0.dev202009182 fi "$ROOT_DIR/scripts/onnx/test.sh" fi diff --git a/.jenkins/pytorch/common_utils.sh b/.jenkins/pytorch/common_utils.sh index 682dd29b4cff..24d6f5676f7d 100644 --- a/.jenkins/pytorch/common_utils.sh +++ b/.jenkins/pytorch/common_utils.sh @@ -66,7 +66,7 @@ function get_bazel() { chmod +x tools/bazel } -TORCHVISION_COMMIT=c2e8a00885e68ae1200eb6440f540e181d9125de +TORCHVISION_COMMIT=e70c91a9ff9b8a20e05c133aec6ec3ed538c32fb function install_torchvision() { # Check out torch/vision at Jun 11 2020 commit diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh index 213750ba7280..8e71738f414e 100755 --- a/.jenkins/pytorch/macos-test.sh +++ b/.jenkins/pytorch/macos-test.sh @@ -63,7 +63,7 @@ test_python_all() { # Increase default limit on open file handles from 256 to 1024 ulimit -n 1024 - python test/run_test.py --verbose --exclude test_jit_cuda_fuser_profiling test_jit_cuda_fuser_legacy test_jit_legacy test_jit_fuser_legacy --determine-from="$DETERMINE_FROM" + python test/run_test.py --verbose --exclude-jit-executor --determine-from="$DETERMINE_FROM" assert_git_not_dirty } diff --git a/.jenkins/pytorch/print_sccache_log.py b/.jenkins/pytorch/print_sccache_log.py index c91472876c33..81c7e0752328 100644 --- a/.jenkins/pytorch/print_sccache_log.py +++ b/.jenkins/pytorch/print_sccache_log.py @@ -6,6 +6,7 @@ lines = f.readlines() for line in lines: - # Ignore errors from CPU instruction set testing - if 'src.c' not in line: + # Ignore errors from CPU instruction set or symbol existing testing + keywords = ['src.c', 'CheckSymbolExists.c'] + if all([keyword not in line for keyword in keywords]): print(line) diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 7e85039a72d1..0e35364a2f5d 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -126,23 +126,18 @@ if ([ -n "$CIRCLE_PULL_REQUEST" ] && [[ "$BUILD_ENVIRONMENT" != *coverage* ]]); file_diff_from_base 
"$DETERMINE_FROM" fi -test_python_nn() { - time python test/run_test.py --include test_nn --verbose --determine-from="$DETERMINE_FROM" - assert_git_not_dirty -} - -test_python_ge_config_profiling() { - time python test/run_test.py --include test_jit_cuda_fuser_profiling test_jit_profiling test_jit_fuser_te test_tensorexpr --verbose --determine-from="$DETERMINE_FROM" +test_python_legacy_jit() { + time python test/run_test.py --include test_jit_cuda_fuser_legacy test_jit_legacy test_jit_fuser_legacy --verbose --determine-from="$DETERMINE_FROM" assert_git_not_dirty } -test_python_ge_config_legacy() { - time python test/run_test.py --include test_jit_cuda_fuser_legacy test_jit_legacy test_jit_fuser_legacy --verbose --determine-from="$DETERMINE_FROM" +test_python_shard1() { + time python test/run_test.py --exclude-jit-executor --shard 1 2 --verbose --determine-from="$DETERMINE_FROM" assert_git_not_dirty } -test_python_all_except_nn_and_cpp_extensions() { - time python test/run_test.py --exclude test_jit_cuda_fuser_profiling test_jit_cuda_fuser_legacy test_nn test_jit_profiling test_jit_legacy test_jit_fuser_legacy test_jit_fuser_te test_tensorexpr --verbose --determine-from="$DETERMINE_FROM" +test_python_shard2() { + time python test/run_test.py --exclude-jit-executor --shard 2 2 --verbose --determine-from="$DETERMINE_FROM" assert_git_not_dirty } @@ -304,7 +299,7 @@ test_xla() { assert_git_not_dirty } -# Do NOT run this test before any other tests, like test_python_nn, etc. +# Do NOT run this test before any other tests, like test_python_shard1, etc. # Because this function uninstalls the torch built from branch, and install # nightly version. test_backward_compatibility() { @@ -338,6 +333,8 @@ test_benchmarks() { pip_install --user "requests" BENCHMARK_DATA="benchmarks/.data" mkdir -p ${BENCHMARK_DATA} + pytest benchmarks/fastrnns/test_bench.py --benchmark-sort=Name --benchmark-json=${BENCHMARK_DATA}/fastrnns_default.json --fuser=default --executor=default + python benchmarks/upload_scribe.py --pytest_bench_json ${BENCHMARK_DATA}/fastrnns_default.json pytest benchmarks/fastrnns/test_bench.py --benchmark-sort=Name --benchmark-json=${BENCHMARK_DATA}/fastrnns_legacy_old.json --fuser=old --executor=legacy python benchmarks/upload_scribe.py --pytest_bench_json ${BENCHMARK_DATA}/fastrnns_legacy_old.json pytest benchmarks/fastrnns/test_bench.py --benchmark-sort=Name --benchmark-json=${BENCHMARK_DATA}/fastrnns_profiling_te.json --fuser=te --executor=profiling @@ -379,19 +376,17 @@ if [[ "${BUILD_ENVIRONMENT}" == *backward* ]]; then elif [[ "${BUILD_ENVIRONMENT}" == *xla* || "${JOB_BASE_NAME}" == *xla* ]]; then install_torchvision test_xla -elif [[ "${BUILD_ENVIRONMENT}" == *ge_config_legacy* || "${JOB_BASE_NAME}" == *ge_config_legacy* ]]; then - test_python_ge_config_legacy -elif [[ "${BUILD_ENVIRONMENT}" == *ge_config_profiling* || "${JOB_BASE_NAME}" == *ge_config_profiling* ]]; then - test_python_ge_config_profiling +elif [[ "${BUILD_ENVIRONMENT}" == *legacy_jit* || "${JOB_BASE_NAME}" == *legacy_jit* ]]; then + test_python_legacy_jit elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then # TODO: run some C++ tests echo "no-op at the moment" elif [[ "${BUILD_ENVIRONMENT}" == *-test1 || "${JOB_BASE_NAME}" == *-test1 ]]; then - test_python_nn - test_cpp_extensions + install_torchvision + test_python_shard1 elif [[ "${BUILD_ENVIRONMENT}" == *-test2 || "${JOB_BASE_NAME}" == *-test2 ]]; then install_torchvision - test_python_all_except_nn_and_cpp_extensions + test_python_shard2 test_aten test_libtorch 
test_custom_script_ops @@ -407,9 +402,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4 test_cpp_extensions else install_torchvision - test_python_nn - test_python_all_except_nn_and_cpp_extensions - test_cpp_extensions + test_python_shard1 + test_python_shard2 test_aten test_vec256 test_libtorch diff --git a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat index a66ef4b651c5..cf7255ce3789 100644 --- a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat +++ b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat @@ -12,4 +12,11 @@ call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Minic if "%REBUILD%"=="" ( call conda install -y -q python=%PYTHON_VERSION% numpy cffi pyyaml boto3 call conda install -y -q -c conda-forge cmake + call conda install -y -q -c rdonnelly libuv ) + +:: Get installed libuv path +@echo off +set libuv_ROOT=%CONDA_PARENT_DIR%\Miniconda3\Library +@echo on +echo libuv_ROOT=%libuv_ROOT% diff --git a/.jenkins/pytorch/win-test-helpers/test_python_all_except_nn.bat b/.jenkins/pytorch/win-test-helpers/test_python_all_except_nn.bat index 4bfb5bc85e66..d76637dd0db7 100644 --- a/.jenkins/pytorch/win-test-helpers/test_python_all_except_nn.bat +++ b/.jenkins/pytorch/win-test-helpers/test_python_all_except_nn.bat @@ -1,3 +1,3 @@ call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat -cd test && python run_test.py --exclude test_jit_cuda_fuser_profiling test_jit_cuda_fuser_legacy test_jit_profiling test_jit_legacy test_jit_fuser_legacy test_jit_fuser_te test_tensorexpr --verbose --determine-from="%1" && cd .. +cd test && python run_test.py --exclude-jit-executor --verbose --determine-from="%1" && cd .. 
if ERRORLEVEL 1 exit /b 1 diff --git a/.jenkins/pytorch/win-test-helpers/test_python_jit_profiling.bat b/.jenkins/pytorch/win-test-helpers/test_python_jit_legacy.bat similarity index 51% rename from .jenkins/pytorch/win-test-helpers/test_python_jit_profiling.bat rename to .jenkins/pytorch/win-test-helpers/test_python_jit_legacy.bat index e437833d8c62..a9168644f471 100644 --- a/.jenkins/pytorch/win-test-helpers/test_python_jit_profiling.bat +++ b/.jenkins/pytorch/win-test-helpers/test_python_jit_legacy.bat @@ -3,9 +3,7 @@ call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat pushd test echo Run jit_profiling tests -python run_test.py --include test_jit_profiling test_jit_fuser_te test_tensorexpr --verbose --determine-from="%1" +python run_test.py --include test_jit_legacy test_jit_fuser_legacy --verbose --determine-from="%1" if ERRORLEVEL 1 exit /b 1 popd - - diff --git a/.jenkins/pytorch/win-test.sh b/.jenkins/pytorch/win-test.sh index 0b0159d04a50..abcd5756d747 100755 --- a/.jenkins/pytorch/win-test.sh +++ b/.jenkins/pytorch/win-test.sh @@ -48,7 +48,7 @@ run_tests() { $SCRIPT_HELPERS_DIR/test_python_nn.bat "$DETERMINE_FROM" && \ $SCRIPT_HELPERS_DIR/test_libtorch.bat if [[ "${USE_CUDA}" == "1" ]]; then - $SCRIPT_HELPERS_DIR/test_python_jit_profiling.bat "$DETERMINE_FROM" + $SCRIPT_HELPERS_DIR/test_python_jit_legacy.bat "$DETERMINE_FROM" fi elif [[ "${JOB_BASE_NAME}" == *-test2 ]]; then $SCRIPT_HELPERS_DIR/test_python_all_except_nn.bat "$DETERMINE_FROM" && \ diff --git a/BUILD.bazel b/BUILD.bazel index 016863ff0958..a8ea7988a242 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -55,6 +55,7 @@ cc_library( "c10/cuda/*.h", "c10/cuda/impl/*.h", "c10/macros/*.h", + "c10/mobile/*.h", "c10/util/*.h", "c10/util/*.hpp", ]), @@ -71,6 +72,7 @@ cc_library( srcs = glob([ "c10/core/*.cpp", "c10/core/impl/*.cpp", + "c10/mobile/*.cpp", "c10/util/*.cpp", ]) + if_cuda( glob([ @@ -721,6 +723,7 @@ torch_cuda_half_options = [ "-DCUDA_HAS_FP16=1", "-D__CUDA_NO_HALF_OPERATORS__", "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_BFLOAT16_CONVERSIONS__", "-D__CUDA_NO_HALF2_OPERATORS__", ] diff --git a/CMakeLists.txt b/CMakeLists.txt index 826c187b602e..0d1225ab450e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -103,7 +103,7 @@ endif() # For non-supported platforms, turn USE_DISTRIBUTED off by default. # It is not tested and likely won't work without additional changes. -if(NOT LINUX) +if(NOT LINUX AND NOT WIN32) set(USE_DISTRIBUTED OFF CACHE STRING "Use distributed") # On macOS, if USE_DISTRIBUTED is enabled (specified by the user), # then make Gloo build with the libuv transport. @@ -226,6 +226,32 @@ option(USE_TBB "Use TBB" OFF) option(ONNX_ML "Enable traditional ONNX ML API." ON) option(HAVE_SOVERSION "Whether to add SOVERSION to the shared objects" OFF) +# Since TensorPipe does not support Windows, set it to OFF when WIN32 detected +# On Windows platform, if user does not install libuv in build conda env and +# does not set libuv_ROOT environment variable. Set USE_DISTRIBUTED to OFF. +if(WIN32) + set(USE_TENSORPIPE OFF) + message(WARNING "TensorPipe cannot be used on Windows. Set it to OFF") + + if(USE_DISTRIBUTED AND NOT DEFINED ENV{libuv_ROOT}) + find_library( + libuv_tmp_LIBRARY + NAMES uv libuv + HINTS $ENV{CONDA_PREFIX}\\Library + PATH_SUFFIXES lib + REQUIRED + NO_DEFAULT_PATH) + if(NOT EXISTS ${libuv_tmp_LIBRARY}) + set(USE_DISTRIBUTED OFF) + set(USE_GLOO OFF) + message( + WARNING "Libuv is not installed in current conda env. 
Set USE_DISTRIBUTED to OFF.") + else() + set(ENV{libuv_ROOT} $ENV{CONDA_PREFIX}\\Library) + endif() + endif() +endif() + # Linux distributions do not want too many embedded sources, in that sense we # need to be able to build pytorch with an (almost) empty third_party # directory. diff --git a/CODEOWNERS b/CODEOWNERS index 77b8d2cbcb36..42aa83bb61bf 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -23,9 +23,9 @@ # Distributed package # This list is mostly if you'd like to be tagged as reviewer, feel free to add # or remove yourself from it. -/torch/lib/c10d/ @pietern @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma -/torch/csrc/distributed/ @pietern @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma -/torch/distributed/ @apaszke @pietern @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma +/torch/lib/c10d/ @pietern @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @mingzhe09088 +/torch/csrc/distributed/ @pietern @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @mingzhe09088 +/torch/distributed/ @apaszke @pietern @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @mingzhe09088 # Distributed tests # This list is mostly if you'd like to be tagged as reviewer, feel free to add diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 03ad14dd843e..a1b4096592a7 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -118,11 +118,37 @@ For example: - modify your Python file `torch/__init__.py` - test functionality -You do not need to repeatedly install after modifying Python files. +You do not need to repeatedly install after modifying Python files (`.py`). However, you would need to reinstall +if you modify Python interface (`.pyi`, `.pyi.in`) or non-Python files (`.cpp`, `.cc`, `.cu`, `.h`, ...). In case you want to reinstall, make sure that you uninstall PyTorch first by running `pip uninstall torch` and `python setup.py clean`. Then you can install in `develop` mode again. +### Tips and Debugging +* A prerequisite to installing PyTorch is CMake. We recommend installing it with [Homebrew](https://brew.sh/) +with `brew install cmake` if you are developing on MacOS or Linux system. +* Our `setup.py` requires Python >= 3.6 +* If you run into errors when running `python setup.py develop`, here are some debugging steps: + 1. Run `printf '#include \nint main() { printf("Hello World");}'|clang -x c -; ./a.out` to make sure + your CMake works and can compile this simple Hello World program without errors. + 2. Nuke your `build` directory. The `setup.py` script compiles binaries into the `build` folder and caches many + details along the way, which saves time the next time you build. If you're running into issues, you can always + `rm -rf build` from the toplevel `pytorch` directory and start over. + 3. If you have made edits to the PyTorch repo, commit any change you'd like to keep and clean the repo with the + following commands (note that clean _really_ removes all untracked files and changes.): + ```bash + git submodule deinit -f . + git clean -xdf + python setup.py clean + git submodule update --init --recursive # very important to sync the submodules + python setup.py develop # then try running the command again + ``` + 4. The main step within `python setup.py develop` is running `make` from the `build` directory. 
If you want to + experiment with some environment variables, you can pass them into the command: + ```bash + ENV_KEY1=ENV_VAL1[, ENV_KEY2=ENV_VAL2]* python setup.py develop + ``` + ## Nightly Checkout & Pull The `tools/nightly.py` script is provided to ease pure Python development of @@ -489,8 +515,7 @@ only interested in a specific component. - Working on a test binary? Run `(cd build && ninja bin/test_binary_name)` to rebuild only that test binary (without rerunning cmake). (Replace `ninja` with `make` if you don't have ninja installed). -- Don't need Caffe2? Pass `BUILD_CAFFE2_OPS=0` to disable build of - Caffe2 operators. +- Don't need Caffe2? Pass `BUILD_CAFFE2=0` to disable Caffe2 build. On the initial build, you can also speed things up with the environment variables `DEBUG`, `USE_DISTRIBUTED`, `USE_MKLDNN`, `USE_CUDA`, `BUILD_TEST`, `USE_FBGEMM`, `USE_NNPACK` and `USE_QNNPACK`. diff --git a/Dockerfile b/Dockerfile index d5619e1a8011..3706aa38b461 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,7 +8,7 @@ # For reference: # https://docs.docker.com/develop/develop-images/build_enhancements/ ARG BASE_IMAGE=ubuntu:18.04 -ARG PYTHON_VERSION=3.7 +ARG PYTHON_VERSION=3.8 FROM ${BASE_IMAGE} as dev-base RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \ @@ -44,14 +44,15 @@ WORKDIR /opt/pytorch COPY --from=conda /opt/conda /opt/conda COPY --from=submodule-update /opt/pytorch /opt/pytorch RUN --mount=type=cache,target=/opt/ccache \ - TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ + TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX 8.0" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \ python setup.py install FROM conda as conda-installs ARG INSTALL_CHANNEL=pytorch-nightly -RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y pytorch torchvision cudatoolkit=10.1 && \ +RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y pytorch torchvision cudatoolkit=11.0.221 && \ /opt/conda/bin/conda clean -ya +RUN /opt/conda/bin/pip install torchelastic FROM ${BASE_IMAGE} as official LABEL com.nvidia.volumes.needed="nvidia_driver" diff --git a/README.md b/README.md index 6191cabcb685..c6c1138747a2 100644 --- a/README.md +++ b/README.md @@ -184,6 +184,13 @@ On MacOS conda install pkg-config libuv ``` +On Windows +```bash +# Add these packages if torch.distributed is needed. +# Distributed package support on Windows is a prototype feature and is subject to changes. +conda install -c conda-forge libuv=1.39 +``` + #### Get the PyTorch Source ```bash git clone --recursive https://github.com/pytorch/pytorch diff --git a/android/README.md b/android/README.md index bf5fa02e6cf4..e67b2e6ec071 100644 --- a/android/README.md +++ b/android/README.md @@ -15,8 +15,8 @@ repositories { } dependencies { - implementation 'org.pytorch:pytorch_android:1.5.0' - implementation 'org.pytorch:pytorch_android_torchvision:1.5.0' + implementation 'org.pytorch:pytorch_android:1.6.0' + implementation 'org.pytorch:pytorch_android_torchvision:1.6.0' } ``` @@ -34,12 +34,12 @@ repositories { dependencies { ... - implementation 'org.pytorch:pytorch_android:1.7.0-SNAPSHOT' - implementation 'org.pytorch:pytorch_android_torchvision:1.7.0-SNAPSHOT' + implementation 'org.pytorch:pytorch_android:1.8.0-SNAPSHOT' + implementation 'org.pytorch:pytorch_android_torchvision:1.8.0-SNAPSHOT' ... } ``` -The current nightly(snapshots) version is the value of `VERSION_NAME` in `gradle.properties` in current folder, at this moment it is `1.7.0-SNAPSHOT`. 
+The current nightly(snapshots) version is the value of `VERSION_NAME` in `gradle.properties` in current folder, at this moment it is `1.8.0-SNAPSHOT`. ## Building PyTorch Android from Source diff --git a/android/gradle.properties b/android/gradle.properties index 6e0dc0ac86b0..0ab42c56396d 100644 --- a/android/gradle.properties +++ b/android/gradle.properties @@ -1,6 +1,6 @@ ABI_FILTERS=armeabi-v7a,arm64-v8a,x86,x86_64 -VERSION_NAME=1.7.0-SNAPSHOT +VERSION_NAME=1.8.0-SNAPSHOT GROUP=org.pytorch MAVEN_GROUP=org.pytorch POM_URL=https://github.com/pytorch/pytorch/tree/master/android diff --git a/android/pytorch_android/src/main/cpp/pytorch_jni_common.cpp b/android/pytorch_android/src/main/cpp/pytorch_jni_common.cpp index 11696daf43a2..fed6170c2bf3 100644 --- a/android/pytorch_android/src/main/cpp/pytorch_jni_common.cpp +++ b/android/pytorch_android/src/main/cpp/pytorch_jni_common.cpp @@ -18,6 +18,17 @@ namespace pytorch_jni { +c10::DeviceType deviceJniCodeToDeviceType(jint deviceJniCode) { + if (deviceJniCode == kDeviceCPU) { + return at::kCPU; + } else if (deviceJniCode == kDeviceVulkan) { + return at::kVulkan; + } + + facebook::jni::throwNewJavaException( + facebook::jni::gJavaLangIllegalArgumentException, "Unknown device"); +} + bool Trace::is_initialized_ = false; #if defined(TRACE_ENABLED) && defined(__ANDROID__) diff --git a/android/pytorch_android/src/main/cpp/pytorch_jni_common.h b/android/pytorch_android/src/main/cpp/pytorch_jni_common.h index fb974d4ad702..9b4e7e5f84a1 100644 --- a/android/pytorch_android/src/main/cpp/pytorch_jni_common.h +++ b/android/pytorch_android/src/main/cpp/pytorch_jni_common.h @@ -1,3 +1,5 @@ +#pragma once + #include #include @@ -18,6 +20,11 @@ namespace pytorch_jni { +constexpr static int kDeviceCPU = 1; +constexpr static int kDeviceVulkan = 2; + +c10::DeviceType deviceJniCodeToDeviceType(jint deviceJniCode); + class Trace { public: #if defined(TRACE_ENABLED) && defined(__ANDROID__) diff --git a/android/pytorch_android/src/main/cpp/pytorch_jni_jit.cpp b/android/pytorch_android/src/main/cpp/pytorch_jni_jit.cpp index b05c19665f20..e4bb4c083160 100644 --- a/android/pytorch_android/src/main/cpp/pytorch_jni_jit.cpp +++ b/android/pytorch_android/src/main/cpp/pytorch_jni_jit.cpp @@ -67,22 +67,25 @@ class PytorchJni : public facebook::jni::HybridClass { private: friend HybridBase; torch::jit::Module module_; + c10::DeviceType deviceType_; public: constexpr static auto kJavaDescriptor = "Lorg/pytorch/NativePeer;"; static facebook::jni::local_ref initHybrid( facebook::jni::alias_ref, - facebook::jni::alias_ref modelPath) { - return makeCxxInstance(modelPath); + facebook::jni::alias_ref modelPath, + jint device) { + return makeCxxInstance(modelPath, device); } #ifdef __ANDROID__ static facebook::jni::local_ref initHybridAndroidAsset( facebook::jni::alias_ref, facebook::jni::alias_ref assetName, - facebook::jni::alias_ref assetManager) { - return makeCxxInstance(assetName, assetManager); + facebook::jni::alias_ref assetManager, + jint device) { + return makeCxxInstance(assetName, assetManager, device); } #endif @@ -127,17 +130,19 @@ class PytorchJni : public facebook::jni::HybridClass { ((void)once); } - PytorchJni(facebook::jni::alias_ref modelPath) { + PytorchJni(facebook::jni::alias_ref modelPath, jint device) { preModuleLoadSetup(); JITCallGuard guard; module_ = torch::jit::load(std::move(modelPath->toStdString())); module_.eval(); + deviceType_ = deviceJniCodeToDeviceType(device); } #ifdef __ANDROID__ PytorchJni( facebook::jni::alias_ref assetName, - 
facebook::jni::alias_ref assetManager) { + facebook::jni::alias_ref assetManager, + jint device) { preModuleLoadSetup(); JNIEnv* env = facebook::jni::Environment::current(); AAssetManager* mgr = AAssetManager_fromJava(env, assetManager.get()); @@ -166,6 +171,7 @@ class PytorchJni : public facebook::jni::HybridClass { assetBuffer, AAsset_getLength(asset))); AAsset_close(asset); module_.eval(); + deviceType_ = deviceJniCodeToDeviceType(device); } #endif @@ -191,7 +197,14 @@ class PytorchJni : public facebook::jni::HybridClass { inputs.reserve(n); for (size_t i = 0; i < n; i++) { at::IValue atIValue = JIValue::JIValueToAtIValue(jinputs->getElement(i)); - inputs.push_back(std::move(atIValue)); + if (at::kVulkan == deviceType_) { + inputs.push_back( + atIValue.isTensor() ? at::IValue{atIValue.toTensor().vulkan()} + : std::move(atIValue)); + } else { + TORCH_CHECK(at::kCPU == deviceType_); + inputs.push_back(std::move(atIValue)); + } } auto output = [&]() { JITCallGuard guard; @@ -212,7 +225,14 @@ class PytorchJni : public facebook::jni::HybridClass { inputs.reserve(n); for (size_t i = 0; i < n; i++) { at::IValue atIValue = JIValue::JIValueToAtIValue(jinputs->getElement(i)); - inputs.push_back(std::move(atIValue)); + if (at::kVulkan == deviceType_) { + inputs.push_back( + atIValue.isTensor() ? at::IValue{atIValue.toTensor().vulkan()} + : std::move(atIValue)); + } else { + TORCH_CHECK(at::kCPU == deviceType_); + inputs.push_back(std::move(atIValue)); + } } if (auto method = module_.find_method(methodName)) { auto output = [&]() { diff --git a/android/pytorch_android/src/main/cpp/pytorch_jni_lite.cpp b/android/pytorch_android/src/main/cpp/pytorch_jni_lite.cpp index 061b85221fe9..8a96e395f267 100644 --- a/android/pytorch_android/src/main/cpp/pytorch_jni_lite.cpp +++ b/android/pytorch_android/src/main/cpp/pytorch_jni_lite.cpp @@ -30,9 +30,6 @@ struct LiteJITCallGuard { } // namespace class PytorchJni : public facebook::jni::HybridClass { - constexpr static int kDeviceCPU = 1; - constexpr static int kDeviceVulkan = 2; - private: friend HybridBase; torch::jit::mobile::Module module_; @@ -51,15 +48,7 @@ class PytorchJni : public facebook::jni::HybridClass { PytorchJni(facebook::jni::alias_ref modelPath, jint device) { LiteJITCallGuard guard; module_ = torch::jit::_load_for_mobile(std::move(modelPath->toStdString())); - if (device == kDeviceCPU) { - deviceType_ = at::kCPU; - } else if (device == kDeviceVulkan) { - deviceType_ = at::kVulkan; - } else { - facebook::jni::throwNewJavaException( - facebook::jni::gJavaLangIllegalArgumentException, - "Unknown device specified"); - } + deviceType_ = deviceJniCodeToDeviceType(device); } static void registerNatives() { @@ -108,7 +97,14 @@ class PytorchJni : public facebook::jni::HybridClass { inputs.reserve(n); for (size_t i = 0; i < n; i++) { at::IValue atIValue = JIValue::JIValueToAtIValue(jinputs->getElement(i)); - inputs.push_back(std::move(atIValue)); + if (at::kVulkan == deviceType_) { + inputs.push_back( + atIValue.isTensor() ? 
at::IValue{atIValue.toTensor().vulkan()} + : std::move(atIValue)); + } else { + TORCH_CHECK(at::kCPU == deviceType_); + inputs.push_back(std::move(atIValue)); + } } if (auto method = module_.find_method(methodName)) { auto output = [&]() { diff --git a/android/pytorch_android/src/main/java/org/pytorch/Module.java b/android/pytorch_android/src/main/java/org/pytorch/Module.java index 9dafc687f993..62db7042d57b 100644 --- a/android/pytorch_android/src/main/java/org/pytorch/Module.java +++ b/android/pytorch_android/src/main/java/org/pytorch/Module.java @@ -11,16 +11,28 @@ public class Module { private INativePeer mNativePeer; /** - * Loads a serialized TorchScript module from the specified path on the disk. + * Loads a serialized TorchScript module from the specified path on the disk to run on specified + * device. * * @param modelPath path to file that contains the serialized TorchScript module. + * @param device {@link org.pytorch.Device} to use for running specified module. * @return new {@link org.pytorch.Module} object which owns torch::jit::Module. */ - public static Module load(final String modelPath) { + public static Module load(final String modelPath, final Device device) { if (!NativeLoader.isInitialized()) { NativeLoader.init(new SystemDelegate()); } - return new Module(new NativePeer(modelPath)); + return new Module(new NativePeer(modelPath, device)); + } + + /** + * Loads a serialized TorchScript module from the specified path on the disk to run on CPU. + * + * @param modelPath path to file that contains the serialized TorchScript module. + * @return new {@link org.pytorch.Module} object which owns torch::jit::Module. + */ + public static Module load(final String modelPath) { + return load(modelPath, Device.CPU); } Module(INativePeer nativePeer) { diff --git a/android/pytorch_android/src/main/java/org/pytorch/NativePeer.java b/android/pytorch_android/src/main/java/org/pytorch/NativePeer.java index 5c6ef31061ae..76c0c6226755 100644 --- a/android/pytorch_android/src/main/java/org/pytorch/NativePeer.java +++ b/android/pytorch_android/src/main/java/org/pytorch/NativePeer.java @@ -13,18 +13,23 @@ class NativePeer implements INativePeer { private final HybridData mHybridData; @DoNotStrip - private static native HybridData initHybrid(String moduleAbsolutePath); + private static native HybridData initHybrid(String moduleAbsolutePath, int deviceJniCode); @DoNotStrip private static native HybridData initHybridAndroidAsset( - String assetName, /* android.content.res.AssetManager */ Object androidAssetManager); + String assetName, /* android.content.res.AssetManager */ + Object androidAssetManager, + int deviceJniCode); - NativePeer(String moduleAbsolutePath) { - mHybridData = initHybrid(moduleAbsolutePath); + NativePeer(String moduleAbsolutePath, Device device) { + mHybridData = initHybrid(moduleAbsolutePath, device.jniCode); } - NativePeer(String assetName, /* android.content.res.AssetManager */ Object androidAssetManager) { - mHybridData = initHybridAndroidAsset(assetName, androidAssetManager); + NativePeer( + String assetName, /* android.content.res.AssetManager */ + Object androidAssetManager, + Device device) { + mHybridData = initHybridAndroidAsset(assetName, androidAssetManager, device.jniCode); } public void resetNative() { diff --git a/android/pytorch_android/src/main/java/org/pytorch/PyTorchAndroid.java b/android/pytorch_android/src/main/java/org/pytorch/PyTorchAndroid.java index 15664dd040ea..b775c2bb2e2c 100644 --- 
a/android/pytorch_android/src/main/java/org/pytorch/PyTorchAndroid.java +++ b/android/pytorch_android/src/main/java/org/pytorch/PyTorchAndroid.java @@ -21,9 +21,14 @@ public final class PyTorchAndroid { * *

This method is meant to use in tests and demos. */ + public static Module loadModuleFromAsset( + final AssetManager assetManager, final String assetName, final Device device) { + return new Module(new NativePeer(assetName, assetManager, device)); + } + public static Module loadModuleFromAsset( final AssetManager assetManager, final String assetName) { - return new Module(new NativePeer(assetName, assetManager)); + return new Module(new NativePeer(assetName, assetManager, Device.CPU)); } /** diff --git a/android/test_app/app/build.gradle b/android/test_app/app/build.gradle index c592728ce9f4..37bdb35e2f19 100644 --- a/android/test_app/app/build.gradle +++ b/android/test_app/app/build.gradle @@ -40,6 +40,7 @@ android { buildConfigField("String", "LOGCAT_TAG", "@string/app_name") buildConfigField("long[]", "INPUT_TENSOR_SHAPE", "new long[]{1, 3, 224, 224}") buildConfigField("boolean", "NATIVE_BUILD", 'false') + buildConfigField("boolean", "USE_VULKAN_DEVICE", 'false') addManifestPlaceholders([APP_NAME: "@string/app_name", MAIN_ACTIVITY: "org.pytorch.testapp.MainActivity"]) } buildTypes { @@ -66,9 +67,17 @@ android { addManifestPlaceholders([APP_NAME: "MBQ"]) buildConfigField("String", "LOGCAT_TAG", "\"pytorch-mbq\"") } + mbvulkan { + dimension "model" + applicationIdSuffix ".mbvulkan" + buildConfigField("String", "MODULE_ASSET_NAME", "\"mobilenet2-vulkan.pt\"") + buildConfigField("boolean", "USE_VULKAN_DEVICE", 'true') + addManifestPlaceholders([APP_NAME: "MBQ"]) + buildConfigField("String", "LOGCAT_TAG", "\"pytorch-mbvulkan\"") + } resnet18 { dimension "model" - applicationIdSuffix ".resneti18" + applicationIdSuffix ".resnet18" buildConfigField("String", "MODULE_ASSET_NAME", "\"resnet18.pt\"") addManifestPlaceholders([APP_NAME: "RN18"]) buildConfigField("String", "LOGCAT_TAG", "\"pytorch-resnet18\"") @@ -122,7 +131,7 @@ android { tasks.all { task -> // Disable externalNativeBuild for all but nativeBuild variant - if (task.name.startsWith('externalNativeBuild') + if (task.name.startsWith('externalNativeBuild') && !task.name.contains('NativeBuild')) { task.enabled = false } @@ -140,8 +149,8 @@ dependencies { //nativeBuildImplementation(name: 'pytorch_android_torchvision-release', ext: 'aar') //extractForNativeBuild(name: 'pytorch_android-release', ext: 'aar') - nightlyImplementation 'org.pytorch:pytorch_android:1.7.0-SNAPSHOT' - nightlyImplementation 'org.pytorch:pytorch_android_torchvision:1.7.0-SNAPSHOT' + nightlyImplementation 'org.pytorch:pytorch_android:1.8.0-SNAPSHOT' + nightlyImplementation 'org.pytorch:pytorch_android_torchvision:1.8.0-SNAPSHOT' aarImplementation(name:'pytorch_android', ext:'aar') aarImplementation(name:'pytorch_android_torchvision', ext:'aar') diff --git a/android/test_app/app/src/main/java/org/pytorch/testapp/MainActivity.java b/android/test_app/app/src/main/java/org/pytorch/testapp/MainActivity.java index 5cc233011c8a..bd7469950f87 100644 --- a/android/test_app/app/src/main/java/org/pytorch/testapp/MainActivity.java +++ b/android/test_app/app/src/main/java/org/pytorch/testapp/MainActivity.java @@ -17,6 +17,7 @@ import java.io.InputStream; import java.io.OutputStream; import java.nio.FloatBuffer; +import org.pytorch.Device; import org.pytorch.IValue; import org.pytorch.Module; import org.pytorch.PyTorchAndroid; @@ -126,7 +127,11 @@ protected Result doModuleForward() { mInputTensorBuffer = Tensor.allocateFloatBuffer((int) numElements); mInputTensor = Tensor.fromBlob(mInputTensorBuffer, BuildConfig.INPUT_TENSOR_SHAPE); PyTorchAndroid.setNumThreads(1); - mModule = 
PyTorchAndroid.loadModuleFromAsset(getAssets(), BuildConfig.MODULE_ASSET_NAME); + mModule = + BuildConfig.USE_VULKAN_DEVICE + ? PyTorchAndroid.loadModuleFromAsset( + getAssets(), BuildConfig.MODULE_ASSET_NAME, Device.VULKAN) + : PyTorchAndroid.loadModuleFromAsset(getAssets(), BuildConfig.MODULE_ASSET_NAME); } final long startTime = SystemClock.elapsedRealtime(); diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 1bcbae8abeff..839964e33c59 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -51,6 +51,7 @@ file(GLOB cudnn_cpp "cudnn/*.cpp") file(GLOB hip_h "hip/*.h" "hip/detail/*.h" "hip/*.cuh" "hip/detail/*.cuh" "hip/impl/*.h") file(GLOB hip_cpp "hip/*.cpp" "hip/detail/*.cpp" "hip/impl/*.cpp") +list(REMOVE_ITEM hip_cpp "${CMAKE_CURRENT_SOURCE_DIR}/hip/detail/LazyNVRTC.cpp") file(GLOB hip_hip "hip/*.hip" "hip/detail/*.hip" "hip/impl/*.hip") file(GLOB hip_nvrtc_stub_h "hip/nvrtc_stub/*.h") file(GLOB hip_nvrtc_stub_cpp "hip/nvrtc_stub/*.cpp") @@ -78,6 +79,7 @@ file(GLOB native_cuda_cu "native/cuda/*.cu") exclude(native_cuda_cu "${native_cuda_cu}" ${native_cuda_cu_sp}) file(GLOB native_cuda_cpp "native/cuda/*.cpp") file(GLOB native_cuda_h "native/cuda/*.h" "native/cuda/*.cuh") +file(GLOB native_hip_h "native/hip/*.h" "native/hip/*.cuh") file(GLOB native_cudnn_cpp "native/cudnn/*.cpp") file(GLOB native_sparse_cuda_cu "native/sparse/cuda/*.cu") file(GLOB native_sparse_cuda_cpp "native/sparse/cuda/*.cpp") @@ -372,7 +374,7 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake" set(INSTALL_HEADERS ${base_h} ${ATen_CORE_HEADERS}) if(NOT INTERN_BUILD_MOBILE) - list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${cudnn_h} ${hip_h} ${miopen_h}) + list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${cudnn_h} ${hip_h} ${miopen_h}) endif() # https://stackoverflow.com/questions/11096471/how-can-i-install-a-hierarchy-of-files-using-cmake diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 1496b6ee551d..1977f945a0fb 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -64,6 +64,11 @@ bool Context::deterministic() const { } void Context::setDeterministic(bool b) { + if (b) { + TORCH_WARN_ONCE("torch.set_deterministic is in beta, and its design and " + " functionality may change in the future."); + } + _deterministic = b; } @@ -230,4 +235,27 @@ Allocator* getCPUAllocator() { return getTHDefaultAllocator(); } +// override_allow_tf32_flag = true +// means the allow_tf32 flags are overrided and tf32 is force disabled +// override_allow_tf32_flag = false +// means the original allow_tf32 flags are followed +thread_local bool override_allow_tf32_flag = false; + +NoTF32Guard::NoTF32Guard() { + if (!override_allow_tf32_flag) { + changed = true; + override_allow_tf32_flag = true; + } +} + +NoTF32Guard::~NoTF32Guard() { + if (changed) { + override_allow_tf32_flag = false; + } +} + +bool NoTF32Guard::should_disable_tf32() { + return override_allow_tf32_flag; +} + } // namespace at diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index b8782209def5..fed5e88e5314 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -327,4 +327,20 @@ static inline void manual_seed(uint64_t seed) { } } +// When the global flag `allow_tf32` is set to true, cuBLAS handles are +// automatically configured to use math mode CUBLAS_TF32_TENSOR_OP_MATH. 
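The `Context.cpp` hunk above implements `NoTF32Guard` as a thread-local override flag plus an RAII wrapper, so only the outermost guard flips the flag and nesting stays safe. Below is a minimal standalone sketch of that same pattern; the names (`OverrideGuard`, `g_override_flag`) are illustrative only and are not part of ATen.

```cpp
// Standalone sketch of the thread-local RAII override pattern used by
// NoTF32Guard in the patch above. Illustrative names, not ATen code.
#include <iostream>

thread_local bool g_override_flag = false;

struct OverrideGuard {
  OverrideGuard() {
    // Only the outermost guard sets the flag; nested guards change nothing.
    if (!g_override_flag) {
      changed_ = true;
      g_override_flag = true;
    }
  }
  ~OverrideGuard() {
    // Restore the flag only if this guard was the one that set it.
    if (changed_) {
      g_override_flag = false;
    }
  }
  static bool is_active() { return g_override_flag; }

 private:
  bool changed_ = false;
};

int main() {
  std::cout << OverrideGuard::is_active() << "\n";  // 0
  {
    OverrideGuard outer;
    OverrideGuard inner;  // nested guard: does not clear the flag early
    std::cout << OverrideGuard::is_active() << "\n";  // 1
  }
  std::cout << OverrideGuard::is_active() << "\n";  // 0
}
```

The `changed_` bookkeeping is what makes nested guards well-behaved: only the guard that actually flipped the flag restores it on destruction.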
+// For some operators, such as addmv, TF32 offers no performance improvement +// but causes precision loss. To help this case, this class implements +// a RAII guard that can be used to quickly disable TF32 within its scope. +// +// Usage: +// NoTF32Guard disable_tf32; +struct TORCH_API NoTF32Guard { + NoTF32Guard(); + ~NoTF32Guard(); + static bool should_disable_tf32(); +private: + bool changed = false; +}; + } // namespace at diff --git a/aten/src/ATen/DLConvertor.cpp b/aten/src/ATen/DLConvertor.cpp index 786fe6214dc3..fd045960b52c 100644 --- a/aten/src/ATen/DLConvertor.cpp +++ b/aten/src/ATen/DLConvertor.cpp @@ -43,13 +43,10 @@ DLDataType getDLDataType(const Tensor& t) { throw std::logic_error("BFloat16 is not supported by dlpack"); break; case ScalarType::QInt8: - throw std::logic_error("QInt8 is not supported by dlpack"); - break; case ScalarType::QUInt8: - throw std::logic_error("QUInt8 is not supported by dlpack"); - break; case ScalarType::QInt32: - throw std::logic_error("QInt32 is not supported by dlpack"); + case ScalarType::QUInt4x2: + throw std::logic_error("QUInt/QInt types are not supported by dlpack"); break; case ScalarType::ComplexHalf: throw std::logic_error("ComplexHalf is not supported by dlpack"); diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h index 53a22db6ff9c..e0fc25c394d3 100644 --- a/aten/src/ATen/Dispatch.h +++ b/aten/src/ATen/Dispatch.h @@ -34,6 +34,21 @@ return __VA_ARGS__(); \ } +#define AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE( \ + enum_type, type, underlying_type, bitwidth, qmin, qmax, ...) \ + case enum_type: { \ + using scalar_t = type; \ + using underlying_t C10_UNUSED_DISPATCH_CUDA_WORKAROUND = \ + scalar_t::underlying; \ + const auto& SCALAR_TYPE C10_UNUSED_DISPATCH_CUDA_WORKAROUND = enum_type; \ + const auto& UNDERLYING_TYPE C10_UNUSED_DISPATCH_CUDA_WORKAROUND = \ + toUnderlying(enum_type); \ + int bit_width = bitwidth; \ + int64_t quant_min = qmin; \ + int64_t quant_max = qmax; \ + return __VA_ARGS__(); \ + } + // This macro should be used to skip bfloat16 dispatch on non-ROCm platforms and // should be removed once the bfloat16 bringup is complete on other platforms. // This is supposed to be used as a wrapper around the lambda function passed to @@ -346,6 +361,25 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} } \ }() +#define AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& the_type = TYPE; \ + /* don't use TYPE again in case it is an expensive or side-effect op */ \ + at::ScalarType _st = ::detail::scalar_type(the_type); \ + switch (_st) { \ + AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE( \ + at::kQInt8, at::qint8, int8_t, CHAR_BIT, SCHAR_MIN, SCHAR_MAX, __VA_ARGS__) \ + AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE( \ + at::kQUInt8, at::quint8, uint8_t, CHAR_BIT, 0, UCHAR_MAX, __VA_ARGS__) \ + AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE( \ + at::kQInt32, at::qint32, int, CHAR_BIT * sizeof(int), INT_MIN, INT_MAX, __VA_ARGS__) \ + AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE( \ + at::kQUInt4x2, at::quint4x2, uint8_t, 4, 0, 15, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ + } \ + }() + #define AT_DISPATCH_ALL_TYPES_AND_COMPLEX(TYPE, NAME, ...) 
\ [&] { \ const auto& the_type = TYPE; \ diff --git a/aten/src/ATen/LegacyTHFunctionsCPU.cpp b/aten/src/ATen/LegacyTHFunctionsCPU.cpp index 40b7ccafbd9a..f0a55470cc1c 100644 --- a/aten/src/ATen/LegacyTHFunctionsCPU.cpp +++ b/aten/src/ATen/LegacyTHFunctionsCPU.cpp @@ -39,7 +39,7 @@ namespace { Tensor & _th_masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & source) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Bool: { auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); @@ -112,7 +112,7 @@ Tensor & _th_masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & Tensor & _th_masked_scatter_bool_(Tensor & self, const Tensor & mask, const Tensor & source) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Bool: { auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); @@ -185,7 +185,7 @@ Tensor & _th_masked_scatter_bool_(Tensor & self, const Tensor & mask, const Tens Tensor & _th_nonzero_out(Tensor & result, const Tensor & self) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Bool: { auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long); @@ -316,7 +316,7 @@ Tensor _th_nonzero(const Tensor & self) { Tensor & _th_index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Bool: { auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); @@ -379,135 +379,10 @@ Tensor & _th_index_copy_(Tensor & self, int64_t dim, const Tensor & index, const } return self; } -Tensor & _th_take_out(Tensor & result, const Tensor & self, const Tensor & index) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - - switch (dispatch_scalar_type) { - case ScalarType::Bool: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); - THBoolTensor_take(result_, self_, index_); - break; - } - case ScalarType::Byte: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); - THByteTensor_take(result_, self_, index_); - break; - } - case ScalarType::Char: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, 
"_th_take_out", false, DeviceType::CPU, ScalarType::Long); - THCharTensor_take(result_, self_, index_); - break; - } - case ScalarType::Double: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); - THDoubleTensor_take(result_, self_, index_); - break; - } - case ScalarType::Float: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); - THFloatTensor_take(result_, self_, index_); - break; - } - case ScalarType::Int: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); - THIntTensor_take(result_, self_, index_); - break; - } - case ScalarType::Long: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); - THLongTensor_take(result_, self_, index_); - break; - } - case ScalarType::Short: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); - THShortTensor_take(result_, self_, index_); - break; - } - default: - AT_ERROR("_th_take_out not supported on CPUType for ", dispatch_scalar_type); - } - return result; -} -Tensor _th_take(const Tensor & self, const Tensor & index) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); - auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); - switch (dispatch_scalar_type) { - case ScalarType::Bool: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); - THBoolTensor_take(result_, self_, index_); - break; - } - case ScalarType::Byte: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); - 
THByteTensor_take(result_, self_, index_); - break; - } - case ScalarType::Char: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); - THCharTensor_take(result_, self_, index_); - break; - } - case ScalarType::Double: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); - THDoubleTensor_take(result_, self_, index_); - break; - } - case ScalarType::Float: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); - THFloatTensor_take(result_, self_, index_); - break; - } - case ScalarType::Int: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); - THIntTensor_take(result_, self_, index_); - break; - } - case ScalarType::Long: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); - THLongTensor_take(result_, self_, index_); - break; - } - case ScalarType::Short: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); - THShortTensor_take(result_, self_, index_); - break; - } - default: - AT_ERROR("_th_take not supported on CPUType for ", dispatch_scalar_type); - } - return result; -} Tensor & _th_put_(Tensor & self, const Tensor & index, const Tensor & source, bool accumulate) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Bool: { auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); @@ -573,7 +448,7 @@ Tensor & _th_put_(Tensor & self, const Tensor & index, const Tensor & source, bo Tensor & _th_index_fill_(Tensor & self, int64_t dim, const Tensor & index, Scalar value) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Bool: { auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CPU, dispatch_scalar_type); @@ -639,7 +514,7 @@ Tensor & _th_index_fill_(Tensor & self, int64_t dim, const Tensor & index, Scala std::tuple _th_mode_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool keepdim) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Byte: { auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); @@ -746,7 +621,7 @@ std::tuple _th_mode(const Tensor & self, int64_t dim, bool keepdi Tensor _th_var(const Tensor & self, bool unbiased) { // DeviceGuard omitted auto 
dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Double: { auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_var", false, DeviceType::CPU, dispatch_scalar_type); @@ -765,7 +640,7 @@ Tensor _th_var(const Tensor & self, bool unbiased) { Tensor _th_std(const Tensor & self, bool unbiased) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Double: { auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_std", false, DeviceType::CPU, dispatch_scalar_type); @@ -784,7 +659,7 @@ Tensor _th_std(const Tensor & self, bool unbiased) { Tensor & _th_renorm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Double: { auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_renorm_out", false, DeviceType::CPU, dispatch_scalar_type); @@ -835,7 +710,7 @@ Tensor _th_renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { Tensor & _th_renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Double: { auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm_", false, DeviceType::CPU, dispatch_scalar_type); @@ -859,7 +734,7 @@ Tensor & _th_renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { Tensor & _th_histc_out(Tensor & result, const Tensor & self, int64_t bins, Scalar min, Scalar max) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Double: { auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_histc_out", false, DeviceType::CPU, dispatch_scalar_type); @@ -910,7 +785,7 @@ Tensor _th_histc(const Tensor & self, int64_t bins, Scalar min, Scalar max) { Tensor _th_trace(const Tensor & self) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Byte: { auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type); @@ -951,265 +826,10 @@ Tensor _th_trace(const Tensor & self) { AT_ERROR("_th_trace not supported on CPUType for ", dispatch_scalar_type); } } -Tensor & _th_addr_out(Tensor & result, const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - - switch (dispatch_scalar_type) { - case ScalarType::Byte: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toByte(); - auto alpha_ = alpha.toByte(); - THByteTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Char: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, 
dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toChar(); - auto alpha_ = alpha.toChar(); - THCharTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Double: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toDouble(); - auto alpha_ = alpha.toDouble(); - THDoubleTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Float: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toFloat(); - auto alpha_ = alpha.toFloat(); - THFloatTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Int: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toInt(); - auto alpha_ = alpha.toInt(); - THIntTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Long: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toLong(); - auto alpha_ = alpha.toLong(); - THLongTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Short: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, 
"_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toShort(); - auto alpha_ = alpha.toShort(); - THShortTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::BFloat16: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toBFloat16(); - auto alpha_ = alpha.toBFloat16(); - THBFloat16Tensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - default: - AT_ERROR("_th_addr_out not supported on CPUType for ", dispatch_scalar_type); - } - return result; -} -Tensor _th_addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); - auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); - switch (dispatch_scalar_type) { - case ScalarType::Byte: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toByte(); - auto alpha_ = alpha.toByte(); - THByteTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Char: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toChar(); - auto alpha_ = alpha.toChar(); - THCharTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Double: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toDouble(); - auto alpha_ = alpha.toDouble(); - THDoubleTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Float: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, 
dispatch_scalar_type); - auto beta_ = beta.toFloat(); - auto alpha_ = alpha.toFloat(); - THFloatTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Int: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toInt(); - auto alpha_ = alpha.toInt(); - THIntTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Long: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toLong(); - auto alpha_ = alpha.toLong(); - THLongTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Short: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toShort(); - auto alpha_ = alpha.toShort(); - THShortTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::BFloat16: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toBFloat16(); - auto alpha_ = alpha.toBFloat16(); - THBFloat16Tensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - default: - AT_ERROR("_th_addr not supported on CPUType for ", dispatch_scalar_type); - } - return result; -} -Tensor & _th_addr_(Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - - switch (dispatch_scalar_type) { - case ScalarType::Byte: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toByte(); - auto alpha_ = alpha.toByte(); - THByteTensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Char: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = 
beta.toChar(); - auto alpha_ = alpha.toChar(); - THCharTensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Double: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toDouble(); - auto alpha_ = alpha.toDouble(); - THDoubleTensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Float: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toFloat(); - auto alpha_ = alpha.toFloat(); - THFloatTensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Int: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toInt(); - auto alpha_ = alpha.toInt(); - THIntTensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Long: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toLong(); - auto alpha_ = alpha.toLong(); - THLongTensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Short: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toShort(); - auto alpha_ = alpha.toShort(); - THShortTensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::BFloat16: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toBFloat16(); - auto alpha_ = alpha.toBFloat16(); - THBFloat16Tensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - default: - AT_ERROR("_th_addr_ not supported on CPUType for ", dispatch_scalar_type); - } - return self; -} std::tuple _th_gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A) { // DeviceGuard omitted auto 
dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Double: { auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_gels_out", false, DeviceType::CPU, dispatch_scalar_type); @@ -1260,7 +880,7 @@ std::tuple _th_gels(const Tensor & self, const Tensor & A) { std::tuple _th_eig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Double: { auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_eig_out", false, DeviceType::CPU, dispatch_scalar_type); @@ -1307,7 +927,7 @@ std::tuple _th_eig(const Tensor & self, bool eigenvectors) { Tensor & _th_potri_out(Tensor & output, const Tensor & self, bool upper) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Double: { auto output_ = checked_dense_tensor_unwrap(output, "output", 0, "_th_potri_out", false, DeviceType::CPU, dispatch_scalar_type); @@ -1350,7 +970,7 @@ Tensor _th_potri(const Tensor & self, bool upper) { std::tuple _th_geqrf_out(Tensor & res1, Tensor & res2, const Tensor & self) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Double: { auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_geqrf_out", false, DeviceType::CPU, dispatch_scalar_type); @@ -1397,7 +1017,7 @@ std::tuple _th_geqrf(const Tensor & self) { Tensor & _th_orgqr_out(Tensor & result, const Tensor & self, const Tensor & input2) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Double: { auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_orgqr_out", false, DeviceType::CPU, dispatch_scalar_type); @@ -1444,7 +1064,7 @@ Tensor _th_orgqr(const Tensor & self, const Tensor & input2) { Tensor & _th_ormqr_out(Tensor & result, const Tensor & self, const Tensor & input2, const Tensor & input3, bool left, bool transpose) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Double: { auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_ormqr_out", false, DeviceType::CPU, dispatch_scalar_type); @@ -1495,7 +1115,7 @@ Tensor _th_ormqr(const Tensor & self, const Tensor & input2, const Tensor & inpu std::tuple _th_multinomial_alias_setup_out(Tensor & J, Tensor & q, const Tensor & probs) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(J); - + switch (dispatch_scalar_type) { case ScalarType::Double: { auto probs_ = checked_dense_tensor_unwrap(probs, "probs", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CPU, dispatch_scalar_type); @@ -1542,7 +1162,7 @@ std::tuple _th_multinomial_alias_setup(const Tensor & probs) { Tensor & _th_multinomial_alias_draw_out(Tensor & result, const Tensor & q, const Tensor & J, int64_t num_samples, c10::optional generator) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(result); - + switch (dispatch_scalar_type) { case ScalarType::Double: { auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_multinomial_alias_draw_out", false, DeviceType::CPU, ScalarType::Long); diff --git a/aten/src/ATen/LegacyTHFunctionsCPU.h b/aten/src/ATen/LegacyTHFunctionsCPU.h index e6e3fa0fb7e5..1bc9b66777bc 100644 --- 
a/aten/src/ATen/LegacyTHFunctionsCPU.h +++ b/aten/src/ATen/LegacyTHFunctionsCPU.h @@ -39,9 +39,6 @@ Tensor & _th_renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm); Tensor & _th_histc_out(Tensor & result, const Tensor & self, int64_t bins, Scalar min, Scalar max); Tensor _th_histc(const Tensor & self, int64_t bins, Scalar min, Scalar max); Tensor _th_trace(const Tensor & self); -Tensor & _th_addr_out(Tensor & result, const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha); -Tensor _th_addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha); -Tensor & _th_addr_(Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha); std::tuple _th_gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A); std::tuple _th_gels(const Tensor & self, const Tensor & A); std::tuple _th_eig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors); diff --git a/aten/src/ATen/NumericUtils.h b/aten/src/ATen/NumericUtils.h index 6cbd974f51dd..d691fec1aa34 100644 --- a/aten/src/ATen/NumericUtils.h +++ b/aten/src/ATen/NumericUtils.h @@ -42,12 +42,12 @@ inline bool _isnan(T val) { template ::value, int>::type = 0> inline C10_HOST_DEVICE bool _isnan(T val) { - return at::_isnan(float(val)); + return at::_isnan(static_cast(val)); } inline C10_HOST_DEVICE bool _isnan(at::BFloat16 val) { - return at::_isnan(float(val)); + return at::_isnan(static_cast(val)); } template diff --git a/aten/src/ATen/ThreadLocalState.cpp b/aten/src/ATen/ThreadLocalState.cpp index 7ed7f66e2522..6d74e2f47ce0 100644 --- a/aten/src/ATen/ThreadLocalState.cpp +++ b/aten/src/ATen/ThreadLocalState.cpp @@ -10,9 +10,8 @@ namespace at { ThreadLocalState::ThreadLocalState(bool keep_grad_mode) : dispatch_key_(c10::impl::tls_local_dispatch_key_set()), - debug_info_(c10::ThreadLocalDebugInfo::current()), - observers_enabled_(at::isRecordFunctionEnabled()) { - callbacks_ = _getTLSCallbacks(); + debug_info_(c10::ThreadLocalDebugInfo::current()) { + rf_tls_ = at::get_record_function_tls_(); #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) keep_grad_mode_ = keep_grad_mode; @@ -31,9 +30,7 @@ void ThreadLocalState::setThreadLocalState( } #endif - _setTLSCallbacks(state.callbacks_); - - at::enableRecordFunction(state.observers_enabled_); + at::set_record_function_tls_(state.rf_tls_); c10::ThreadLocalDebugInfo::_forceCurrentDebugInfo(state.debug_info_); diff --git a/aten/src/ATen/ThreadLocalState.h b/aten/src/ATen/ThreadLocalState.h index 186e521f01bd..f0cb85f0ff84 100644 --- a/aten/src/ATen/ThreadLocalState.h +++ b/aten/src/ATen/ThreadLocalState.h @@ -30,10 +30,8 @@ class TORCH_API ThreadLocalState { // with DebugInfoGuard std::shared_ptr debug_info_; - // RecordFunction TLS callbacks - RecordFunctionCallbacks callbacks_; - - bool observers_enabled_ = false; + // RecordFunction TLS + RecordFunctionTLS rf_tls_; #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) bool keep_grad_mode_ = true; diff --git a/aten/src/ATen/WrapDimUtils.h b/aten/src/ATen/WrapDimUtils.h index c248ea461116..2768efe6e683 100644 --- a/aten/src/ATen/WrapDimUtils.h +++ b/aten/src/ATen/WrapDimUtils.h @@ -30,14 +30,15 @@ static inline int64_t maybe_wrap_dim(int64_t dim, const std::vector& dims, int64_t dim_post_expr) { +// wrap each dim in the dims array, taking dim_post_expr as the true number of dimensions +static inline void maybe_wrap_dims_n(int64_t* dims, int64_t ndims, int64_t dim_post_expr) { if (dim_post_expr <= 0) { 
dim_post_expr = 1; // this will make range [-1, 0] } int64_t min = -dim_post_expr; int64_t max = dim_post_expr - 1; - for (auto& dim : dims) { + for (int64_t i = 0; i < ndims; ++i) { + auto &dim = dims[i]; if (dim < min || dim > max) { TORCH_CHECK_INDEX(false, "Dimension out of range (expected to be in range of [", @@ -47,6 +48,13 @@ static inline void maybe_wrap_dims(std::vector& dims, int64_t dim_post_ } } +// Wrap each dim in a contiguous container, taking dim_post_expr as the true number of dimensions +// E.g. could also be std::array or c10::SmallVector +template +inline void maybe_wrap_dims(Container& dims, int64_t dim_post_expr) { + return maybe_wrap_dims_n(dims.data(), dims.size(), dim_post_expr); +} + // previously, size [0] tensors were the only possible empty tensors; thus, it wasn't possible // to cat empty tensors unless all the other tensors were 1-dimensional, so we allowed these tensors // to be "skipped" (both for wrap dimension behavior and dimension size checking). diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index cb1ea44d2e7d..8f19cebb1f52 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -357,7 +357,7 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) { KERNEL(ADD_NS(hinge_embedding_loss), "hinge_embedding_loss", Tensor (const Tensor &, const Tensor &, double, int64_t), fp32) KERNEL(ADD_NS(kl_div), "kl_div", Tensor (const Tensor &, const Tensor &, int64_t, bool), fp32) KERNEL(ADD_NS(l1_loss), "l1_loss", Tensor (const Tensor &, const Tensor &, int64_t), fp32) - KERNEL(ADD_NS(smooth_l1_loss), "smooth_l1_loss", Tensor (const Tensor &, const Tensor &, int64_t), fp32) + KERNEL(ADD_NS(smooth_l1_loss), "smooth_l1_loss", Tensor (const Tensor &, const Tensor &, int64_t, double), fp32) KERNEL(ADD_NS(mse_loss), "mse_loss", Tensor (const Tensor &, const Tensor &, int64_t), fp32) KERNEL(ADD_NS(margin_ranking_loss), "margin_ranking_loss", Tensor (const Tensor &, const Tensor &, const Tensor &, double, int64_t), fp32) KERNEL(ADD_NS(multilabel_margin_loss), "multilabel_margin_loss", Tensor (const Tensor &, const Tensor &, int64_t), fp32) diff --git a/aten/src/ATen/core/aten_interned_strings.h b/aten/src/ATen/core/aten_interned_strings.h index c5e4b0ea3c01..54481814be5b 100644 --- a/aten/src/ATen/core/aten_interned_strings.h +++ b/aten/src/ATen/core/aten_interned_strings.h @@ -502,6 +502,7 @@ _(aten, multinomial) \ _(aten, mv) \ _(aten, mvlgamma) \ _(aten, nansum) \ +_(aten, nan_to_num) \ _(aten, narrow) \ _(aten, narrow_copy) \ _(aten, native_batch_norm) \ @@ -611,6 +612,7 @@ _(aten, sigmoid) \ _(aten, sign) \ _(aten, signbit) \ _(aten, silu) \ +_(aten, sgn) \ _(aten, sin) \ _(aten, sinh) \ _(aten, size) \ diff --git a/aten/src/ATen/core/boxing/KernelFunction.cpp b/aten/src/ATen/core/boxing/KernelFunction.cpp index b5d552e0e31c..f84352ebee1f 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.cpp +++ b/aten/src/ATen/core/boxing/KernelFunction.cpp @@ -22,6 +22,7 @@ void fallthrough_kernel(OperatorKernel*, const OperatorHandle&, Stack*) { void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle& op, Stack*) { TORCH_INTERNAL_ASSERT(0, op.operator_name(), " has kernels registered to both Math and a backend mapped to AutogradOther. " + "This makes the backend kernel unreachable (see Note [Ambiguity in AutogradOther kernel]). 
" "If it's intended to override Math kernel behavior, please open an issue to request a dedicated " "Autograd dispatch key for the backend."); } diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index 5fa379e40710..0942659d2960 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -157,10 +157,9 @@ const KernelFunction& OperatorEntry::computeDispatchTableEntry(const c10::Dispat } bool OperatorEntry::hasKernelForDispatchKeySet(DispatchKeySet ks) const { - for (auto k : ks) { - if (kernels_.find(k) != kernels_.end()) { - return true; - } + TORCH_INTERNAL_ASSERT(kernels_.find(DispatchKey::Undefined) == kernels_.end()); + for (auto& kv : kernels_) { + if (ks.has(kv.first)) return true; } return false; } @@ -196,6 +195,9 @@ std::pair OperatorEntry::computeDispatchTab // In the past we directly call into backends(filled with catchAll) after BackendSelect. // Now that we first call Autograd backend keys after BackendSelect, we should fill those // with catchAll as well. + // The implementation of (2.1) & (2.3) relies on the invariant that for a given backend, + // `computeDispatchTableEntryWithDebug()` will be called for that backend's autograd key after the + // backend key. See Note [Refresh Runtime Autograd entries in dispatchTable_] // (3) Use fallthrough kernel that are registered as fallback. // (4) Use catchAll kernel if available // Alias Key Precedence: @@ -272,7 +274,8 @@ void OperatorEntry::updateDispatchTable_(const c10::Dispatcher& dispatcher, Disp for (auto k : c10::getRuntimeDispatchKeySet(dispatch_key)) { updateDispatchTableEntry_(dispatcher, k); } - // Registering to backend key might affect computed entry at its Autograd backend key due to 2.2. + // Note [Refresh Runtime Autograd entries in dispatchTable_] + // Registering to backend key might affect computed entry at its Autograd backend key due to (2.1) & (2.3). DispatchKey autograd_key = getAutogradKeyFromBackend(dispatch_key); updateDispatchTableEntry_(dispatcher, autograd_key); } diff --git a/aten/src/ATen/core/function_schema.h b/aten/src/ATen/core/function_schema.h index a9182787d2e6..a7b4e694d52e 100644 --- a/aten/src/ATen/core/function_schema.h +++ b/aten/src/ATen/core/function_schema.h @@ -156,18 +156,29 @@ struct FunctionSchema { checkSchema(); } - // check whether this schema is backward compatible with the old one. - // the following conditions are considered as this schema is backward - // compatible with old: - // 1) two schemas are equal - // 2) this schema has the same or more positional args than old, - // and any positional arg in this schema is backward compatible - // with the corresponding one in old schema, which could be an arg - // or a kwarg, if it has, or it must provide a default value - // 3) this schema has the same or more kwargs than old, and all the kwargs - // in old schema can find the corresponding kwarg in this schema which - // is backward compatible with the old kwarg, and the extra kwargs in - // this schema must provide default values. + // Checks whether this schema is backward compatible with the old one. + // The following conditions must be true: + // [Function structure] The new schema's name, overload-name, varargs, and + // return arity are the same. + // [Output Narrowing] The new schema's output type must be the same class + // or inherit from the old schema's output type. 
+ // [Argument count] The new schema must have at least as many arguments as + // the old schema (considering the list of positional and kwargs). + // [Arg Compatibility] Every argument in the old schema has a corresponding + // argument in the new schema that: + // * is at the same position. + // * has the same name. + // * is either positional, or kwarg and the old argument was kwarg. + // * has the same type, or the old argument's type inherits from the + // new argument's type. + // [Default Values] Every new argument must have a default value. + // E.g. + // OK f_new(a, b, c=1) => f_old(a, b) + // NOK f_new(a, c=1, *, b) => f_old(a, *, b) + // OK f_new(a, b, *, c) => f_old(a, *, b, c) + // NOK f_new(a, *, b, c) -> f_old(a, b, *, c) + // NOK f_new(a, *, c, b) => f_old(a, *, b, c) + // OK f_new(a, *, b, c, d=1) => f_old(a, *, b, c) bool isBackwardCompatibleWith( const FunctionSchema& old, std::ostream* why_not = nullptr) const; diff --git a/aten/src/ATen/core/function_schema_inl.h b/aten/src/ATen/core/function_schema_inl.h index bc9a68fbad3f..2185b35bc593 100644 --- a/aten/src/ATen/core/function_schema_inl.h +++ b/aten/src/ATen/core/function_schema_inl.h @@ -111,69 +111,35 @@ inline bool FunctionSchema::isBackwardCompatibleWith( return false; } for (size_t i = 0; i < returns().size(); ++i) { - // functions are covariant in arguments but contravariant in returns + // Backwards compatibility requires covariance on argument types + // (i.e. more generic), and contravariance on return types (i.e. + // more specific). if (!old.returns().at(i).isBackwardCompatibleWith( returns().at(i), why_not)) { return false; } } - std::vector args, old_args; - std::map kwargs, old_kwargs; - auto split_func = [](const std::vector& arguments, - std::vector* positionals, - std::map* nameds) { - for (const Argument& arg : arguments) { - if (!arg.kwarg_only()) { - positionals->emplace_back(&arg); - } - nameds->emplace(arg.name(), &arg); - } - }; - // we split args into positional and keyward parts, - split_func(arguments(), &args, &kwargs); - split_func(old.arguments(), &old_args, &old_kwargs); - if (old_args.size() > args.size()) { - return false; - } - // make sure that all the old positional args have their corresponding - // backward compatible positional args in this schema - for (size_t i = 0; i < old_args.size(); ++i) { - if (!args.at(i)->isBackwardCompatibleWith( - *old_args.at(i), - why_not)) { + + // Make sure that all the old arguments have their corresponding backward + // compatible arguments in this schema. + for (size_t i = 0; i < old.arguments().size(); ++i) { + if (!arguments().at(i).isBackwardCompatibleWith( + old.arguments().at(i), why_not)) { return false; } } - // check the extra positional args in this schema either has corresponding - // backward compatible keyward args since positional args also can be used as - // a keyward arg, or provided default values - for (size_t i = old_args.size(); i < args.size(); ++i) { - if (!args.at(i)->default_value()) { - auto it = old_kwargs.find(args.at(i)->name()); - if (it == old_kwargs.end() || - !args.at(i)->isBackwardCompatibleWith( - *it->second, - why_not)) { - return false; + + // Validate that all new arguments provided a default value. 
+ for (size_t i = old.arguments().size(); i < arguments().size(); ++i) { + if (!arguments().at(i).default_value()) { + if (why_not) { + *why_not + << "Function schema not backward compatible since the new argument '" + << arguments().at(i).name() << "' of type " + << arguments().at(i).type()->str() + << " did not provide a default value."; } - } - } - // make sure that all the keyword args in the old schema have their - // corresponding backward compatible keyward args in this schema - for (auto& kv : old_kwargs) { - auto it = kwargs.find(kv.first); - if (it == kwargs.end() || - !it->second->isBackwardCompatibleWith( - *kv.second, - why_not)) { - return false; - } - kwargs.erase(it); - } - // check all the extra keyword args in this schema provide default values - for (auto& kv : kwargs) { - if (!kv.second->default_value()) { return false; } } @@ -186,7 +152,6 @@ inline void FunctionSchema::checkArg( const Argument& argument, optional pos) const { if (!value.type()->isSubtypeOf(argument.type())) { - std::string position = pos ? ::c10::str(" in position ", *pos) : ""; TORCH_CHECK( false, formatTypeMismatchMsg( diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h index bce5b27e37b1..69aaf167acee 100644 --- a/aten/src/ATen/core/interned_strings.h +++ b/aten/src/ATen/core/interned_strings.h @@ -59,6 +59,8 @@ namespace c10 { _(prim, Store) \ _(prim, AutogradZero) \ _(prim, AutogradAnyNonZero) \ + _(prim, AutogradAllNonZero) \ + _(prim, AutogradAllZero) \ _(prim, Starred) \ _(prim, TupleConstruct) \ _(prim, TupleUnpack) \ @@ -270,6 +272,7 @@ namespace c10 { _(prim, grad) \ _(aten, zero_) \ _(aten, fill_) \ + _(aten, masked_fill_) \ FORALL_ATEN_BASE_SYMBOLS(_) \ _(onnx, Add) \ _(onnx, Concat) \ @@ -357,7 +360,8 @@ namespace c10 { _(attr, scope) \ _(attr, keepdims) \ _(attr, cache_id) \ - _(attr, new_axis) + _(attr, new_axis) \ + _(attr, warn_id) #else #define FORALL_NS_SYMBOLS(_) \ _(namespaces, prim) \ diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index 74eaa7012ac1..1f8cfbd242b9 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -263,7 +263,12 @@ struct SingleElementType : public Type { } protected: - SingleElementType(TypePtr elem) : Type(Kind), elem(std::move(elem)) {} + SingleElementType(TypePtr elem) : Type(Kind), elem(std::move(elem)) { + if (!this->elem) { + throw std::runtime_error(c10::str( + "Can not create ", typeKindToString(Kind), " with None type")); + } + } private: TypePtr elem; @@ -483,6 +488,13 @@ struct CAFFE2_API SymbolicShape { dims_ = shape_symbols; } + ShapeSymbol operator[](size_t i) const { + if (!dims_) { + throw std::runtime_error("Rank isn't fixed"); + } + return (*dims_).at(i); + } + // Returns rank or nullopt in case of unranked shape. 
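[Editor's note] The isBackwardCompatibleWith rewrite above reduces to positional-prefix compatibility plus a default-value requirement for any extra arguments. A toy illustration of that last rule only; Arg and extra_args_have_defaults are stand-ins invented for this sketch, not the FunctionSchema API.

#include <iostream>
#include <optional>
#include <string>
#include <vector>

struct Arg {
  std::string name;
  std::optional<std::string> default_value;  // unset = no default
};

// Toy version of the [Default Values] rule only: arguments the new schema
// adds beyond the old one must all carry defaults.
bool extra_args_have_defaults(const std::vector<Arg>& old_args,
                              const std::vector<Arg>& new_args) {
  if (new_args.size() < old_args.size()) return false;
  for (size_t i = old_args.size(); i < new_args.size(); ++i) {
    if (!new_args[i].default_value) return false;
  }
  return true;
}

int main() {
  std::vector<Arg> f_old = {{"a", {}}, {"b", {}}};
  std::vector<Arg> ok    = {{"a", {}}, {"b", {}}, {"c", "1"}};  // f_new(a, b, c=1)
  std::vector<Arg> nok   = {{"a", {}}, {"b", {}}, {"c", {}}};   // f_new(a, b, c)
  std::cout << extra_args_have_defaults(f_old, ok) << ' '
            << extra_args_have_defaults(f_old, nok) << '\n';    // prints: 1 0
}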
c10::optional rank() const { if(!dims_) { @@ -543,7 +555,7 @@ struct VaryingShape { return dims_ == other.dims_; } - const c10::optional& operator[](int i) const { + const c10::optional &operator[](size_t i) const { if (!dims_) { throw std::runtime_error("Rank isn't fixed"); } diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp index 475c59759f78..13e82d434647 100644 --- a/aten/src/ATen/core/type.cpp +++ b/aten/src/ATen/core/type.cpp @@ -716,6 +716,9 @@ TupleType::TupleType( schema_(std::move(schema)) { has_free_variables_ = std::any_of(elements_.begin(), elements_.end(), [](TypePtr v) { + if (!v) { + throw std::runtime_error("Can not create tuple with None type"); + } return v->hasFreeVariables(); }); if (schema_) { diff --git a/aten/src/ATen/cpu/vec256/vec256_base.h b/aten/src/ATen/cpu/vec256/vec256_base.h index 0e66cb357965..b6cc1db24028 100644 --- a/aten/src/ATen/cpu/vec256/vec256_base.h +++ b/aten/src/ATen/cpu/vec256/vec256_base.h @@ -239,6 +239,13 @@ struct Vec256 { // Specifically map() does not perform the type conversion needed by abs. return map([](T x) { return static_cast(std::abs(x)); }); } + + template ::value, int>::type = 0> + Vec256 sgn() const { + return map(at::native::sgn_impl); + } + template ::value, int>::type = 0> Vec256 angle() const { @@ -729,6 +736,14 @@ inline Vec256 operator^(const Vec256& a, const Vec256& b) { #endif +template>::value, int> = 0> +inline Vec256 operator~(const Vec256& a) { + Vec256 ones; // All bits are 1 + memset((T*) ones, 0xFF, 32); + return a ^ ones; +} + + template inline Vec256& operator += (Vec256& a, const Vec256& b) { a = a + b; diff --git a/aten/src/ATen/cpu/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec256/vec256_complex_double.h index fbc7a480a4c0..0827b33a3122 100644 --- a/aten/src/ATen/cpu/vec256/vec256_complex_double.h +++ b/aten/src/ATen/cpu/vec256/vec256_complex_double.h @@ -134,6 +134,16 @@ template <> class Vec256> { auto angle = _mm256_permute_pd(angle_(), 0x05); // angle 90-angle return _mm256_and_pd(angle, real_mask); // angle 0 } + Vec256> sgn() const { + auto abs = abs_(); + auto zero = _mm256_setzero_pd(); + auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ); + auto abs_val = Vec256(abs); + + auto div = values / abs_val.values; // x / abs(x) + + return blendv(div, zero, mask); + } __m256d real_() const { const __m256d real_mask = _mm256_castsi256_pd(_mm256_setr_epi64x(0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000)); diff --git a/aten/src/ATen/cpu/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec256/vec256_complex_float.h index 892345e9d5c5..ea931acc494b 100644 --- a/aten/src/ATen/cpu/vec256/vec256_complex_float.h +++ b/aten/src/ATen/cpu/vec256/vec256_complex_float.h @@ -171,6 +171,16 @@ template <> class Vec256> { auto angle = _mm256_permute_ps(angle_(), 0xB1); // angle 90-angle return _mm256_and_ps(angle, real_mask); // angle 0 } + Vec256> sgn() const { + auto abs = abs_(); + auto zero = _mm256_setzero_ps(); + auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ); + auto abs_val = Vec256(abs); + + auto div = values / abs_val.values; // x / abs(x) + + return _mm256_blendv_ps(div, zero, mask); + } __m256 real_() const { const __m256 real_mask = _mm256_castsi256_ps(_mm256_setr_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)); diff --git a/aten/src/ATen/cpu/vec256/vec256_int.h b/aten/src/ATen/cpu/vec256/vec256_int.h index 98afd8bdd33c..30bf6421adb3 100644 --- a/aten/src/ATen/cpu/vec256/vec256_int.h +++ 
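[Editor's note] The vectorized sgn() kernels above implement, lane by lane, the scalar rule sgn(z) = z / |z| with sgn(0) = 0. A scalar sketch of the same semantics using std::complex; it mirrors the blendv-with-zero-mask trick, not the AVX code itself.

#include <complex>
#include <iostream>

// sgn(z) = z / |z| for z != 0, and 0 for z == 0 — the rule the
// zero-mask/blendv sequence applies per SIMD lane.
template <typename T>
std::complex<T> sgn(std::complex<T> z) {
  T a = std::abs(z);
  return a == T(0) ? std::complex<T>(0, 0) : z / a;
}

int main() {
  std::cout << sgn(std::complex<double>(3.0, 4.0)) << '\n';  // prints: (0.6,0.8)
  std::cout << sgn(std::complex<double>(0.0, 0.0)) << '\n';  // prints: (0,0)
}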
b/aten/src/ATen/cpu/vec256/vec256_int.h @@ -104,6 +104,8 @@ class Vec256 : public Vec256i { } void store(void* ptr, int count = size()) const { if (count == size()) { + // ptr need not to be aligned here. See + // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); } else if (count > 0) { __at_align32__ int64_t tmp_values[size()]; @@ -228,6 +230,8 @@ class Vec256 : public Vec256i { } void store(void* ptr, int count = size()) const { if (count == size()) { + // ptr need not to be aligned here. See + // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); } else if (count > 0) { __at_align32__ int32_t tmp_values[size()]; @@ -449,6 +453,8 @@ class Vec256 : public Vec256i { } void store(void* ptr, int count = size()) const { if (count == size()) { + // ptr need not to be aligned here. See + // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); } else if (count > 0) { __at_align32__ int16_t tmp_values[size()]; @@ -699,6 +705,8 @@ class Vec256 : public Vec256i { } void store(void* ptr, int count = size()) const { if (count == size()) { + // ptr need not to be aligned here. See + // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); } else if (count > 0) { __at_align32__ int8_t tmp_values[size()]; @@ -879,8 +887,8 @@ Vec256 inline operator*(const Vec256& a, const Vec256 template Vec256 inline int_elementwise_binary_256(const Vec256& a, const Vec256& b, Op op) { - __at_align32__ T values_a[Vec256::size()]; - __at_align32__ T values_b[Vec256::size()]; + T values_a[Vec256::size()]; + T values_b[Vec256::size()]; a.store(values_a); b.store(values_b); for (int i = 0; i != Vec256::size(); i++) { @@ -1039,6 +1047,10 @@ template>: inline Vec256 operator^(const Vec256& a, const Vec256& b) { return _mm256_xor_si256(a, b); } +template>::value, int> = 0> +inline Vec256 operator~(const Vec256& a) { + return _mm256_xor_si256(a, _mm256_set1_epi32(-1)); +} Vec256 Vec256::eq(const Vec256& other) const { return (*this == other) & Vec256(1); diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index 0311399649e7..26423889caa4 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -374,7 +374,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { // manually to be able to use tensor cores for FP16. On CUDA 11, this is no longer required. 
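[Editor's note] Both operator~ overloads added above rely on the identity ~x == x XOR all-ones (for integers, -1 in two's complement). A quick scalar check of that identity:

#include <cstdint>
#include <iostream>

int main() {
  // ~x == x ^ 0xFF...FF, which the AVX2 path expresses as
  // _mm256_xor_si256(a, _mm256_set1_epi32(-1)) and the generic path as
  // XOR with a vector memset to 0xFF.
  for (int32_t x : {0, 1, -7, 123456}) {
    std::cout << ((~x) == (x ^ int32_t(-1))) << ' ';  // prints: 1 1 1 1
  }
  std::cout << '\n';
}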
TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); } else { - AT_ERROR("BFloat16 gemm in CUDA requires Ampere or later GPU"); + TORCH_CHECK(false, "BFloat16 gemm in CUDA requires Ampere or later GPU"); } } #endif @@ -407,19 +407,22 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { #endif #if !defined(__HIP_PLATFORM_HCC__) || (defined(__HIP_PLATFORM_HCC__) && HIP_VERSION >= 210) - template <> - void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); - cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); - cublasOperation_t op = _cublasOpFromChar(trans); - _cublasAdjustLdLevel2(m, n, &lda); - GEMV_CHECK_ARGVALUES(c10::complex); - TORCH_CUDABLAS_CHECK( - cublasCgemv(handle, op, m, n, reinterpret_cast(&alpha), reinterpret_cast(a), - lda, reinterpret_cast(x), incx, reinterpret_cast(&beta), - reinterpret_cast(y), incy)); - } +template <> +void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)) { + // gemv is bw bound, and does not benefit from TF32. But the precision + // loss still happens on TF32. So we disable it here. + NoTF32Guard disable_tf32; + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); + cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); + cublasOperation_t op = _cublasOpFromChar(trans); + _cublasAdjustLdLevel2(m, n, &lda); + GEMV_CHECK_ARGVALUES(c10::complex); + TORCH_CUDABLAS_CHECK( + cublasCgemv(handle, op, m, n, reinterpret_cast(&alpha), reinterpret_cast(a), + lda, reinterpret_cast(x), incx, reinterpret_cast(&beta), + reinterpret_cast(y), incy)); +} #endif template <> @@ -436,6 +439,9 @@ void gemv(CUDABLAS_GEMV_ARGTYPES(double)) { template <> void gemv(CUDABLAS_GEMV_ARGTYPES(float)) { + // gemv is bw bound, and does not benefit from TF32. But the precision + // loss still happens on TF32. So we disable it here. 
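[Editor's note] The gemv paths above wrap the call in a NoTF32Guard so the handle-pool code (a later hunk) can skip CUBLAS_TF32_TENSOR_OP_MATH. As a rough sketch of the RAII pattern such a guard typically follows (a thread-local flag held for the guard's lifetime); this is an illustration only, not ATen's actual implementation.

#include <iostream>

// Illustrative RAII guard: while an instance is alive on this thread,
// should_disable_tf32() reports true and callers fall back to default math mode.
class NoTF32GuardSketch {
 public:
  NoTF32GuardSketch() { ++depth_; }
  ~NoTF32GuardSketch() { --depth_; }
  NoTF32GuardSketch(const NoTF32GuardSketch&) = delete;
  NoTF32GuardSketch& operator=(const NoTF32GuardSketch&) = delete;
  static bool should_disable_tf32() { return depth_ > 0; }

 private:
  static thread_local int depth_;
};

thread_local int NoTF32GuardSketch::depth_ = 0;

int main() {
  std::cout << NoTF32GuardSketch::should_disable_tf32() << '\n';  // 0
  {
    NoTF32GuardSketch guard;
    std::cout << NoTF32GuardSketch::should_disable_tf32() << '\n';  // 1
  }
  std::cout << NoTF32GuardSketch::should_disable_tf32() << '\n';  // 0
}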
+ NoTF32Guard disable_tf32; // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); @@ -492,46 +498,6 @@ void gemv(CUDABLAS_GEMV_ARGTYPES(at::BFloat16)) { } #endif -namespace { -template -cublasStatus_t cublasGer(const cublasHandle_t &handle, int64_t m, int64_t n, scalar_t *alpha, scalar_t *x, int64_t incx, scalar_t *y, int64_t incy, scalar_t *a, int64_t lda) { - TORCH_CHECK(false, "cublas ger is defined only for float and double"); - return {}; -} -template<> -cublasStatus_t cublasGer(const cublasHandle_t &handle, int64_t m, int64_t n, float *alpha, float *x, int64_t incx, float *y, int64_t incy, float *a, int64_t lda) { - return cublasSger(handle, m, n, alpha, x, incx, y, incy, a, lda); -} -template<> -cublasStatus_t cublasGer(const cublasHandle_t &handle, int64_t m, int64_t n, double *alpha, double *x, int64_t incx, double *y, int64_t incy, double *a, int64_t lda) { - return cublasDger(handle, m, n, alpha, x, incx, y, incy, a, lda); -} -} // anonymous namespace - -template -void ger(int64_t m, int64_t n, scalar_t alpha, scalar_t *x, int64_t incx, scalar_t *y, int64_t incy, scalar_t *a, int64_t lda) -{ - _cublasAdjustLdLevel2(m, n, &lda); - TORCH_CHECK((m <= INT_MAX) && - (n <= INT_MAX) && - (lda <= INT_MAX) && - (incx <= INT_MAX) && - (incy <= INT_MAX), - "cublasSger/cublasDger only supports m, n, lda, incx, incy with " - "the bound [val] <= %d", INT_MAX); - int i_m = (int)m; - int i_n = (int)n; - int i_lda = (int)lda; - int i_incx = (int)incx; - int i_incy = (int)incy; - - cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); - TORCH_CUDABLAS_CHECK(cublasGer( - handle, i_m, i_n, &alpha, x, i_incx, y, i_incy, a, i_lda)); -} -template void ger(int64_t m, int64_t n, float alpha, float *x, int64_t incx, float *y, int64_t incy, float *a, int64_t lda); -template void ger(int64_t m, int64_t n, double alpha, double *x, int64_t incx, double *y, int64_t incy, double *a, int64_t lda); - /* LEVEL 1 BLAS FUNCTIONS */ template <> diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp index 404f322545f8..0165c53ac60d 100644 --- a/aten/src/ATen/cuda/CublasHandlePool.cpp +++ b/aten/src/ATen/cuda/CublasHandlePool.cpp @@ -45,7 +45,7 @@ cublasHandle_t getCurrentCUDABlasHandle() { // On CUDA >= 11, and architecture >= Ampere, cuBLAS can use TF32 to speedup // FP32 data type calculations based on the value of the allow_tf32 flag. // To enable TF32, set the math mode of the handle to CUBLAS_TF32_TENSOR_OP_MATH. - if (at::globalContext().allowTF32CuBLAS()) { + if (!NoTF32Guard::should_disable_tf32() && at::globalContext().allowTF32CuBLAS()) { TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH)); } else { TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); diff --git a/aten/src/ATen/cuda/Exceptions.h b/aten/src/ATen/cuda/Exceptions.h index 615ba3e92b71..80e39c6bc6bc 100644 --- a/aten/src/ATen/cuda/Exceptions.h +++ b/aten/src/ATen/cuda/Exceptions.h @@ -19,20 +19,23 @@ class CuDNNError : public c10::Error { } // namespace c10 +#define AT_CUDNN_CHECK_WITH_SHAPES(EXPR, ...) AT_CUDNN_CHECK(EXPR, "\n", ##__VA_ARGS__) + // See Note [CHECK macro] -#define AT_CUDNN_CHECK(EXPR) \ - do { \ - cudnnStatus_t status = EXPR; \ - if (status != CUDNN_STATUS_SUCCESS) { \ - if (status == CUDNN_STATUS_NOT_SUPPORTED) { \ - TORCH_CHECK_WITH(CuDNNError, false, \ - "cuDNN error: ", \ - cudnnGetErrorString(status), \ - ". 
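[Editor's note] The reworked AT_CUDNN_CHECK above threads extra context through ##__VA_ARGS__ so call sites (and the AT_CUDNN_CHECK_WITH_SHAPES wrapper) can append descriptor dumps to the error message. A minimal standalone macro following the same pattern; MY_CHECK and append_all are purely illustrative names.

#include <iostream>
#include <sstream>
#include <stdexcept>

// Fold any extra arguments into the error message. With no extra arguments,
// ##__VA_ARGS__ swallows the trailing comma (a GCC/Clang/MSVC extension,
// used the same way in the diff above).
#define MY_CHECK(cond, ...)                          \
  do {                                               \
    if (!(cond)) {                                   \
      std::ostringstream oss_;                       \
      oss_ << "check failed: " #cond;                \
      append_all(oss_, ##__VA_ARGS__);               \
      throw std::runtime_error(oss_.str());          \
    }                                                \
  } while (0)

template <typename... Args>
void append_all(std::ostringstream& oss, const Args&... args) {
  ((oss << ' ' << args), ...);  // C++17 fold; no-op when the pack is empty
}

int main() {
  try {
    int status = 3;
    MY_CHECK(status == 0, "status =", status, "(extra context)");
  } catch (const std::exception& e) {
    std::cout << e.what() << '\n';  // check failed: status == 0 status = 3 (extra context)
  }
}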
This error may appear if you passed in a non-contiguous input."); \ - } else { \ - TORCH_CHECK_WITH(CuDNNError, false, "cuDNN error: ", cudnnGetErrorString(status)); \ - } \ - } \ +#define AT_CUDNN_CHECK(EXPR, ...) \ + do { \ + cudnnStatus_t status = EXPR; \ + if (status != CUDNN_STATUS_SUCCESS) { \ + if (status == CUDNN_STATUS_NOT_SUPPORTED) { \ + TORCH_CHECK_WITH(CuDNNError, false, \ + "cuDNN error: ", \ + cudnnGetErrorString(status), \ + ". This error may appear if you passed in a non-contiguous input.", ##__VA_ARGS__); \ + } else { \ + TORCH_CHECK_WITH(CuDNNError, false, \ + "cuDNN error: ", cudnnGetErrorString(status), ##__VA_ARGS__); \ + } \ + } \ } while (0) namespace at { namespace cuda { namespace blas { diff --git a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp index c43d53751aee..b2d8df49f51b 100644 --- a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp +++ b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp @@ -435,144 +435,6 @@ Tensor & _th_index_copy_(Tensor & self, int64_t dim, const Tensor & index, const } return self; } -Tensor & _th_take_out(Tensor & result, const Tensor & self, const Tensor & index) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - - switch (dispatch_scalar_type) { - case ScalarType::Bool: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); - THCudaBoolTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Byte: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); - THCudaByteTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Char: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); - THCudaCharTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Double: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); - THCudaDoubleTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Float: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, 
dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); - THCudaTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Int: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); - THCudaIntTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Long: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); - THCudaLongTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Short: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); - THCudaShortTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Half: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); - THCudaHalfTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - default: - AT_ERROR("_th_take_out not supported on CUDAType for ", dispatch_scalar_type); - } - return result; -} -Tensor _th_take(const Tensor & self, const Tensor & index) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); - auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); - switch (dispatch_scalar_type) { - case ScalarType::Bool: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); - THCudaBoolTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Byte: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); - THCudaByteTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Char: { - auto self_ = checked_dense_tensor_unwrap(self, 
"self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); - THCudaCharTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Double: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); - THCudaDoubleTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Float: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); - THCudaTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Int: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); - THCudaIntTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Long: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); - THCudaLongTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Short: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); - THCudaShortTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Half: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); - THCudaHalfTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - default: - AT_ERROR("_th_take not supported on CUDAType for ", dispatch_scalar_type); - } - return result; -} Tensor & _th_put_(Tensor & self, const Tensor & index, const Tensor & source, bool accumulate) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 58f6a8d53e92..28b9738034e7 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -28,6 +28,10 @@ #include #endif +#ifndef USE_ROCM +#include +#endif + #include #include @@ -116,10 +120,14 @@ bool CUDAHooks::hasCuDNN() const { return AT_CUDNN_ENABLED(); } -#ifdef USE_DIRECT_NVRTC +#if defined(USE_DIRECT_NVRTC) static std::pair, at::cuda::NVRTC*> load_nvrtc() { return std::make_pair(nullptr, at::cuda::load_nvrtc()); } +#elif !defined(USE_ROCM) +static std::pair, at::cuda::NVRTC*> load_nvrtc() { + return std::make_pair(nullptr, &at::cuda::detail::lazyNVRTC); +} #else static std::pair, at::cuda::NVRTC*> load_nvrtc() { #if defined(_WIN32) diff --git 
a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp new file mode 100644 index 000000000000..fae48c08b61f --- /dev/null +++ b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp @@ -0,0 +1,171 @@ +#include + +#include +#include +#include + +namespace at { +namespace cuda { +namespace detail { +namespace _stubs { + +at::DynamicLibrary& getCUDALibrary() { +#if defined(_WIN32) + static at::DynamicLibrary lib("nvcuda.dll"); +#else + static at::DynamicLibrary lib("libcuda.so.1"); +#endif + return lib; +} + +at::DynamicLibrary& getNVRTCLibrary() { + constexpr auto major = CUDA_VERSION / 1000; + constexpr auto minor = ( CUDA_VERSION / 10 ) % 10; +#if defined(_WIN32) + auto libname = std::string("nvrtc64_") + std::to_string(major) + std::to_string(minor) + "_0.dll"; +#else + static auto libname = std::string("libnvrtc.so.") + std::to_string(major) + "." + std::to_string(minor); +#endif + static at::DynamicLibrary lib(libname.c_str()); + return lib; +} + +#define _STUB_1(LIB, NAME, RETTYPE, ARG1) \ +RETTYPE NAME(ARG1 a1) { \ + auto fn = reinterpret_cast(get## LIB ## Library().sym(__func__)); \ + if (!fn) \ + throw std::runtime_error("Can't get " C10_STRINGIZE(NAME) ); \ + lazyNVRTC.NAME = fn; \ + return fn(a1); \ +} + +#define _STUB_2(LIB, NAME, RETTYPE, ARG1, ARG2) \ +RETTYPE NAME(ARG1 a1, ARG2 a2) { \ + auto fn = reinterpret_cast(get## LIB ## Library().sym(__func__)); \ + if (!fn) \ + throw std::runtime_error("Can't get " C10_STRINGIZE(NAME) ); \ + lazyNVRTC.NAME = fn; \ + return fn(a1, a2); \ +} + +#define _STUB_3(LIB, NAME, RETTYPE, ARG1, ARG2, ARG3) \ +RETTYPE NAME(ARG1 a1, ARG2 a2, ARG3 a3) { \ + auto fn = reinterpret_cast(get## LIB ## Library().sym(__func__)); \ + if (!fn) \ + throw std::runtime_error("Can't get " C10_STRINGIZE(NAME) ); \ + lazyNVRTC.NAME = fn; \ + return fn(a1, a2, a3); \ +} + +#define _STUB_4(LIB, NAME, RETTYPE, ARG1, ARG2, ARG3, ARG4) \ +RETTYPE NAME(ARG1 a1, ARG2 a2, ARG3 a3, ARG4 a4) { \ + auto fn = reinterpret_cast(get## LIB ## Library().sym(__func__)); \ + if (!fn) \ + throw std::runtime_error("Can't get " C10_STRINGIZE(NAME) ); \ + lazyNVRTC.NAME = fn; \ + return fn(a1, a2, a3, a4); \ +} + +#define CUDA_STUB1(NAME, A1) _STUB_1(CUDA, NAME, CUresult CUDAAPI, A1) +#define CUDA_STUB2(NAME, A1, A2) _STUB_2(CUDA, NAME, CUresult CUDAAPI, A1, A2) +#define CUDA_STUB3(NAME, A1, A2, A3) _STUB_3(CUDA, NAME, CUresult CUDAAPI, A1, A2, A3) +#define CUDA_STUB4(NAME, A1, A2, A3, A4) _STUB_4(CUDA, NAME, CUresult CUDAAPI, A1, A2, A3, A4) + +#define NVRTC_STUB1(NAME, A1) _STUB_1(NVRTC, NAME, nvrtcResult, A1) +#define NVRTC_STUB2(NAME, A1, A2) _STUB_2(NVRTC, NAME, nvrtcResult, A1, A2) +#define NVRTC_STUB3(NAME, A1, A2, A3) _STUB_3(NVRTC, NAME, nvrtcResult, A1, A2, A3) + +NVRTC_STUB2(nvrtcVersion, int*, int*); +NVRTC_STUB2(nvrtcAddNameExpression, nvrtcProgram, const char * const); + +nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, + const char *src, + const char *name, + int numHeaders, + const char * const *headers, + const char * const *includeNames) { + auto fn = reinterpret_cast(getNVRTCLibrary().sym(__func__)); + if (!fn) + throw std::runtime_error("Can't get nvrtcCreateProgram"); + lazyNVRTC.nvrtcCreateProgram = fn; + return fn(prog, src, name, numHeaders, headers, includeNames); +} + +NVRTC_STUB1(nvrtcDestroyProgram, nvrtcProgram *); +NVRTC_STUB2(nvrtcGetPTXSize, nvrtcProgram, size_t *); +NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *); +NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char * const *); +_STUB_1(NVRTC, nvrtcGetErrorString, const char 
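[Editor's note] The _STUB_N macros above all follow the same lazy-binding pattern: on first call, look the symbol up by name, cache the raw function pointer in the dispatch table, and forward the arguments. A self-contained, POSIX-only sketch of that pattern against libm (dlopen/dlsym are the real POSIX calls; the table, soname, and function names are illustrative). Build with something like: g++ -std=c++17 sketch.cpp -ldl.

#include <dlfcn.h>
#include <iostream>
#include <stdexcept>

// Dispatch table holding raw function pointers, analogous to the NVRTC struct.
struct MathTable {
  double (*cos)(double);
};

MathTable lazy_math;  // filled in on first use

void* get_libm() {
  // A real loader dlopens a specific soname, as getNVRTCLibrary() does above.
  static void* handle = dlopen("libm.so.6", RTLD_LAZY);
  if (!handle) throw std::runtime_error("can't load libm");
  return handle;
}

// Stub: resolves the symbol once, caches it, then forwards the call.
double cos_stub(double x) {
  auto fn = reinterpret_cast<double (*)(double)>(dlsym(get_libm(), "cos"));
  if (!fn) throw std::runtime_error("can't resolve cos");
  lazy_math.cos = fn;  // subsequent calls go straight to libm
  return fn(x);
}

int main() {
  lazy_math.cos = &cos_stub;                // table starts out pointing at the stubs
  std::cout << lazy_math.cos(0.0) << '\n';  // resolves, then prints 1
  std::cout << lazy_math.cos(0.0) << '\n';  // now calls libm's cos directly
}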
*, nvrtcResult); +NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*); +NVRTC_STUB2(nvrtcGetProgramLog, nvrtcProgram, char *); +NVRTC_STUB3(nvrtcGetLoweredName, nvrtcProgram, const char *, const char **); + +CUDA_STUB2(cuModuleLoadData, CUmodule *, const void *); +CUDA_STUB3(cuModuleGetFunction, CUfunction *, CUmodule, const char *); +CUDA_STUB4(cuOccupancyMaxActiveBlocksPerMultiprocessor, int *, CUfunction, int, size_t); +CUDA_STUB2(cuGetErrorString, CUresult, const char **); +CUDA_STUB1(cuCtxGetCurrent, CUcontext *); +CUDA_STUB1(cuModuleUnload, CUmodule); +CUDA_STUB3(cuDevicePrimaryCtxGetState, CUdevice, unsigned int *, int *); +CUDA_STUB4(cuLinkCreate, unsigned int, CUjit_option *, void **, CUlinkState *); +CUDA_STUB3(cuLinkComplete, CUlinkState, void **, size_t *); + +// Irregularly shaped functions +CUresult CUDAAPI cuLaunchKernel(CUfunction f, + unsigned int gridDimX, + unsigned int gridDimY, + unsigned int gridDimZ, + unsigned int blockDimX, + unsigned int blockDimY, + unsigned int blockDimZ, + unsigned int sharedMemBytes, + CUstream hStream, + void **kernelParams, + void **extra) { + auto fn = reinterpret_cast(getCUDALibrary().sym(__func__)); + if (!fn) + throw std::runtime_error("Can't get cuLaunchKernel"); + lazyNVRTC.cuLaunchKernel = fn; + return fn(f, + gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, + sharedMemBytes, hStream, kernelParams, extra); +} + +CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, + const void *image, + unsigned int numOptions, + CUjit_option *options, + void **optionValues) { + auto fn = reinterpret_cast(getCUDALibrary().sym(__func__)); + if (!fn) + throw std::runtime_error("Can't get cuModuleLoadDataEx"); + lazyNVRTC.cuModuleLoadDataEx = fn; + return fn(module, image, numOptions, options, optionValues); +} + +CUresult CUDAAPI +cuLinkAddData(CUlinkState state, + CUjitInputType type, + void *data, + size_t size, + const char *name, + unsigned int numOptions, + CUjit_option *options, + void **optionValues) { + auto fn = reinterpret_cast(getCUDALibrary().sym(__func__)); + if (!fn) + throw std::runtime_error("Can't get cuLinkAddData"); + lazyNVRTC.cuLinkAddData = fn; + return fn(state, type, data, size, name, numOptions, options, optionValues); +} + +} // namespace _stubs + +NVRTC lazyNVRTC = { +#define _REFERENCE_MEMBER(name) _stubs::name, + AT_FORALL_NVRTC(_REFERENCE_MEMBER) +#undef _REFERENCE_MEMBER +}; +} // namespace detail +} // namespace cuda +} // namespace at diff --git a/aten/src/ATen/cuda/detail/LazyNVRTC.h b/aten/src/ATen/cuda/detail/LazyNVRTC.h new file mode 100644 index 000000000000..810e1c322dbd --- /dev/null +++ b/aten/src/ATen/cuda/detail/LazyNVRTC.h @@ -0,0 +1,11 @@ +#pragma once +#include +namespace at { namespace cuda { +// Forward-declares at::cuda::NVRTC +struct NVRTC; + +namespace detail { +extern NVRTC lazyNVRTC; +} + +}} // at::cuda::detail diff --git a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h index 4630465115c7..00e57ca63520 100644 --- a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h +++ b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h @@ -42,6 +42,7 @@ namespace at { namespace cuda { _(nvrtcGetProgramLog) \ _(nvrtcGetLoweredName) \ _(cuModuleLoadData) \ + _(cuModuleLoadDataEx) \ _(cuModuleGetFunction) \ _(cuOccupancyMaxActiveBlocksPerMultiprocessor) \ _(cuGetErrorString) \ diff --git a/aten/src/ATen/cudnn/Descriptors.cpp b/aten/src/ATen/cudnn/Descriptors.cpp index 2863212a03a8..aba7b407162f 100644 --- a/aten/src/ATen/cudnn/Descriptors.cpp +++ 
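[Editor's note] The lazyNVRTC initializer at the end of the new file uses the usual X-macro trick: one FORALL macro enumerates the API names, and a per-use macro expands each name into both the struct member and its initial stub entry. A tiny self-contained version of the same trick; all names here are illustrative.

#include <iostream>

// One list of API names, reused for the struct layout and its initializer.
#define FORALL_API(_) \
  _(open)             \
  _(close)

int stub_open()  { std::cout << "open stub\n";  return 0; }
int stub_close() { std::cout << "close stub\n"; return 0; }

struct Api {
#define DECLARE_MEMBER(name) int (*name)();
  FORALL_API(DECLARE_MEMBER)
#undef DECLARE_MEMBER
};

// Each entry initially points at its stub, mirroring _REFERENCE_MEMBER above.
Api lazy_api = {
#define REFERENCE_MEMBER(name) stub_##name,
  FORALL_API(REFERENCE_MEMBER)
#undef REFERENCE_MEMBER
};

int main() {
  lazy_api.open();   // prints: open stub
  lazy_api.close();  // prints: close stub
}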
b/aten/src/ATen/cudnn/Descriptors.cpp @@ -4,7 +4,6 @@ #include #include -#include namespace at { namespace native { @@ -144,4 +143,38 @@ void FilterDescriptor::set(const at::Tensor &t, int64_t pad, bool force_nhwc) { set(getDataType(t), (int) dim, size, filter_format); } +std::string cudnnMemoryFormatToString(cudnnTensorFormat_t tformat) { + switch (tformat) { + case CUDNN_TENSOR_NCHW: + return "CUDNN_TENSOR_NCHW"; + case CUDNN_TENSOR_NHWC: + return "CUDNN_TENSOR_NHWC"; + default: + std::ostringstream oss; + oss << "(unknown cudnn tensor format " << static_cast(tformat) << ")"; + return oss.str(); + } +} + +std::ostream& operator<<(std::ostream & out, const FilterDescriptor& d) { + out << "FilterDescriptor " << static_cast(d.desc()) << "\n"; + int nbDims; + int dimA[CUDNN_DIM_MAX]; + cudnnDataType_t dtype; + cudnnTensorFormat_t tformat; + cudnnGetFilterNdDescriptor(d.desc(), CUDNN_DIM_MAX, &dtype, &tformat, &nbDims, dimA); + out << " type = " << cudnnTypeToString(dtype) << "\n"; + out << " tensor_format = " << cudnnMemoryFormatToString(tformat) << "\n"; + out << " nbDims = " << nbDims << "\n"; + // Read out only nbDims of the arrays! + out << " dimA = "; + for (auto i : ArrayRef{dimA, static_cast(nbDims)}) { + out << i << ", "; + } + out << "\n"; + return out; +} + +void FilterDescriptor::print() { std::cout << *this; } + }} diff --git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h index 04e027491709..2aed3f66632f 100644 --- a/aten/src/ATen/cudnn/Descriptors.h +++ b/aten/src/ATen/cudnn/Descriptors.h @@ -1,5 +1,7 @@ #pragma once +#include + #include #include @@ -12,6 +14,8 @@ namespace at { namespace native { +std::string cudnnTypeToString(cudnnDataType_t dtype); + // TODO: Add constructors for all of the descriptors inline int dataSize(cudnnDataType_t dataType) @@ -153,12 +157,15 @@ class TORCH_CUDA_API FilterDescriptor public: void set(const at::Tensor &t, int64_t pad = 0, bool force_nhwc = false); + void print(); private: void set(cudnnDataType_t dataType, int dim, int* size, cudnnTensorFormat_t filter_format) { AT_CUDNN_CHECK(cudnnSetFilterNdDescriptor(mut_desc(), dataType, filter_format, dim, size)); } }; +std::ostream& operator<<(std::ostream & out, const FilterDescriptor& d); + struct TORCH_CUDA_API ConvolutionDescriptor : public Descriptor *a, int *lda, double *w, std::complex *work, int *lwork, double *rwork, int *info); +extern "C" void cheev_(char *jobz, char *uplo, int *n, std::complex *a, int *lda, float *w, std::complex *work, int *lwork, float *rwork, int *info); extern "C" void dsyev_(char *jobz, char *uplo, int *n, double *a, int *lda, double *w, double *work, int *lwork, int *info); extern "C" void ssyev_(char *jobz, char *uplo, int *n, float *a, int *lda, float *w, float *work, int *lwork, int *info); // gesdd extern "C" void zgesdd_(char *jobz, int *m, int *n, std::complex *a, int *lda, - double *s, std::complex *u, int *ldu, std::complex *vt, int *ldvt, std::complex *work, int *lwork, int *rwork, int *iwork, int *info); + double *s, std::complex *u, int *ldu, std::complex *vt, int *ldvt, std::complex *work, int *lwork, double *rwork, int *iwork, int *info); extern "C" void cgesdd_(char *jobz, int *m, int *n, std::complex *a, int *lda, - float *s, std::complex *u, int *ldu, std::complex *vt, int *ldvt, std::complex *work, int *lwork, int *rwork, int *iwork, int *info); + float *s, std::complex *u, int *ldu, std::complex *vt, int *ldvt, std::complex *work, int *lwork, float *rwork, int *iwork, int *info); extern "C" void dgesdd_(char *jobz, int 
*m, int *n, double *a, int *lda, double *s, double *u, int *ldu, double *vt, int *ldvt, double *work, int *lwork, int *iwork, int *info); extern "C" void sgesdd_(char *jobz, int *m, int *n, float *a, int *lda, @@ -116,12 +118,12 @@ void lapackGeqrf(int m, int n, scalar_t *a, int lda, scalar_t *tau, scalar_t *wo template void lapackOrgqr(int m, int n, int k, scalar_t *a, int lda, scalar_t *tau, scalar_t *work, int lwork, int *info); -template -void lapackSymeig(char jobz, char uplo, int n, scalar_t *a, int lda, scalar_t *w, scalar_t *work, int lwork, int *info); +template +void lapackSymeig(char jobz, char uplo, int n, scalar_t *a, int lda, value_t *w, scalar_t *work, int lwork, value_t *rwork, int *info); template void lapackSvd(char jobz, int m, int n, scalar_t *a, int lda, - value_t *s, scalar_t *u, int ldu, scalar_t *vt, int ldvt, scalar_t *work, int lwork, int *rwork, int *iwork, int *info); + value_t *s, scalar_t *u, int ldu, scalar_t *vt, int ldvt, scalar_t *work, int lwork, value_t *rwork, int *iwork, int *info); template void lapackLuSolve(char trans, int n, int nrhs, scalar_t *a, int lda, int *ipiv, scalar_t *b, int ldb, int *info); @@ -255,33 +257,43 @@ template<> void lapackOrgqr(int m, int n, int k, float *a, int lda, float sorgqr_(&m, &n, &k, a, &lda, tau, work, &lwork, info); } -template<> void lapackSymeig(char jobz, char uplo, int n, double *a, int lda, double *w, double *work, int lwork, int *info) { +template<> void lapackSymeig, double>(char jobz, char uplo, int n, c10::complex *a, int lda, double *w, c10::complex *work, int lwork, double *rwork, int *info) { + zheev_(&jobz, &uplo, &n, reinterpret_cast*>(a), &lda, w, reinterpret_cast*>(work), &lwork, rwork, info); +} + +template<> void lapackSymeig, float>(char jobz, char uplo, int n, c10::complex *a, int lda, float *w, c10::complex *work, int lwork, float *rwork, int *info) { + cheev_(&jobz, &uplo, &n, reinterpret_cast*>(a), &lda, w, reinterpret_cast*>(work), &lwork, rwork, info); +} + +template<> void lapackSymeig(char jobz, char uplo, int n, double *a, int lda, double *w, double *work, int lwork, double* rwork, int *info) { + (void)rwork; // unused dsyev_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, info); } -template<> void lapackSymeig(char jobz, char uplo, int n, float *a, int lda, float *w, float *work, int lwork, int *info) { +template<> void lapackSymeig(char jobz, char uplo, int n, float *a, int lda, float *w, float *work, int lwork, float* rwork, int *info) { + (void)rwork; // unused ssyev_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, info); } template<> void lapackSvd, double>(char jobz, int m, int n, c10::complex *a, int lda, - double *s, c10::complex *u, int ldu, c10::complex *vt, int ldvt, c10::complex *work, int lwork, int *rwork, int *iwork, int *info) { + double *s, c10::complex *u, int ldu, c10::complex *vt, int ldvt, c10::complex *work, int lwork, double *rwork, int *iwork, int *info) { zgesdd_(&jobz, &m, &n, reinterpret_cast*>(a), &lda, s, reinterpret_cast*>(u), &ldu, reinterpret_cast*>(vt), &ldvt, reinterpret_cast*>(work), &lwork, rwork, iwork, info); } template<> void lapackSvd, float>(char jobz, int m, int n, c10::complex *a, int lda, - float *s, c10::complex *u, int ldu, c10::complex *vt, int ldvt, c10::complex *work, int lwork, int *rwork, int *iwork, int *info) { + float *s, c10::complex *u, int ldu, c10::complex *vt, int ldvt, c10::complex *work, int lwork, float *rwork, int *iwork, int *info) { cgesdd_(&jobz, &m, &n, reinterpret_cast*>(a), &lda, s, reinterpret_cast*>(u), &ldu, 
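[Editor's note] For the complex paths, apply_symeig above now allocates the real-valued rwork buffer that cheev_/zheev_ require, sized max(1, 3n - 2). A small helper mirroring just that sizing rule; symeig_lrwork is an illustrative name.

#include <algorithm>
#include <cstdint>
#include <iostream>

// rwork length used by LAPACK's cheev/zheev: at least 1, and 3n - 2 otherwise.
int64_t symeig_lrwork(int64_t n) {
  return std::max<int64_t>(1, 3 * n - 2);
}

int main() {
  for (int64_t n : {0, 1, 4, 100}) {
    std::cout << "n=" << n << " lrwork=" << symeig_lrwork(n) << '\n';  // 1, 1, 10, 298
  }
}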
reinterpret_cast*>(vt), &ldvt, reinterpret_cast*>(work), &lwork, rwork, iwork, info); } template<> void lapackSvd(char jobz, int m, int n, double *a, int lda, - double *s, double *u, int ldu, double *vt, int ldvt, double *work, int lwork, int *rwork, int *iwork, int *info) { + double *s, double *u, int ldu, double *vt, int ldvt, double *work, int lwork, double *rwork, int *iwork, int *info) { dgesdd_(&jobz, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, iwork, info); } template<> void lapackSvd(char jobz, int m, int n, float *a, int lda, - float *s, float *u, int ldu, float *vt, int ldvt, float *work, int lwork, int *rwork, int *iwork, int *info) { + float *s, float *u, int ldu, float *vt, int ldvt, float *work, int lwork, float *rwork, int *iwork, int *info) { sgesdd_(&jobz, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, iwork, info); } @@ -859,7 +871,7 @@ static void apply_symeig(Tensor& self, Tensor& eigvals, bool eigenvectors, bool #else using value_t = typename c10::scalar_value_type::type; auto self_data = self.data_ptr(); - auto eigvals_data = eigvals.data_ptr(); + auto eigvals_data = eigvals.data_ptr(); auto self_matrix_stride = matrixStride(self); auto eigvals_stride = eigvals.size(-1); auto batch_size = batchCount(self); @@ -875,16 +887,26 @@ static void apply_symeig(Tensor& self, Tensor& eigvals, bool eigenvectors, bool // and (batch_size - 1) calls to allocate and deallocate workspace using at::empty() int lwork = -1; scalar_t wkopt; - lapackSymeig(jobz, uplo, n, self_data, n, eigvals_data, &wkopt, lwork, &info); + + Tensor rwork; + value_t* rwork_data = nullptr; + if (isComplexType(at::typeMetaToScalarType(self.dtype()))) { + int64_t lrwork = std::max(int64_t(1), 3 * n - 2); + ScalarType dtype = toValueType(typeMetaToScalarType(self.dtype())); + rwork = at::empty({lrwork}, self.options().dtype(dtype)); + rwork_data = rwork.data_ptr(); + } + + lapackSymeig(jobz, uplo, n, self_data, n, eigvals_data, &wkopt, lwork, rwork_data, &info); lwork = static_cast(real_impl(wkopt)); Tensor work = at::empty({lwork}, self.options()); for (int64_t i = 0; i < batch_size; i++) { scalar_t* self_working_ptr = &self_data[i * self_matrix_stride]; - scalar_t* eigvals_working_ptr = &eigvals_data[i * eigvals_stride]; + value_t* eigvals_working_ptr = &eigvals_data[i * eigvals_stride]; // now compute the eigenvalues and the eigenvectors (optionally) - lapackSymeig(jobz, uplo, n, self_working_ptr, n, eigvals_working_ptr, work.data_ptr(), lwork, &info); + lapackSymeig(jobz, uplo, n, self_working_ptr, n, eigvals_working_ptr, work.data_ptr(), lwork, rwork_data, &info); infos[i] = info; if (info != 0) { return; @@ -898,14 +920,15 @@ std::tuple _symeig_helper_cpu(const Tensor& self, bool eigenvect auto self_sizes = self.sizes().vec(); self_sizes.pop_back(); - auto eigvals = at::empty(self_sizes, self.options()); + ScalarType dtype = toValueType(typeMetaToScalarType(self.dtype())); + auto eigvals = at::empty(self_sizes, self.options().dtype(dtype)); if (self.numel() == 0) { return std::tuple(eigvals, at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT)); } auto self_working_copy = cloneBatchedColumnMajor(self); - AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "symeig_cpu", [&]{ + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "symeig_cpu", [&]{ apply_symeig(self_working_copy, eigvals, eigenvectors, upper, infos); }); @@ -958,22 +981,15 @@ static void apply_svd(Tensor& self, Tensor& U, Tensor& S, Tensor& VT, auto m = self.size(-2); auto n = self.size(-1); auto mn = std::min(m, n); - 
Tensor iwork = at::empty({8*mn}, at::kInt); + Tensor iwork = at::empty({8 * mn}, at::kInt); auto iwork_data = iwork.data_ptr(); Tensor rwork; - int* rwork_data = nullptr; + value_t* rwork_data = nullptr; if (isComplexType(at::typeMetaToScalarType(self.dtype()))) { - auto mx = std::max(m, n); - int64_t lrwork; // These settings are valid for on LAPACK 3.6+ - if (jobz == 'N'){ - lrwork = 7 * mn; - }else if (mx > 10 * mn){ - lrwork = 7 * mn * mn + 7 * mn; - } else { - lrwork = std::max(7 * mn * mn + 7 * mn, 2 * mx * mn + 2 *mn * mn + mn); - } - rwork = at::empty({std::max(int64_t(1), lrwork)}, at::kInt); - rwork_data = rwork.data_ptr(); + auto lrwork = computeLRWorkDim(jobz, m, n); + // rwork is an array of floats or doubles depending on the type + rwork = at::empty({std::max(int64_t(1), lrwork)}, at::typeMetaToScalarType(S.dtype())); + rwork_data = rwork.data_ptr(); } // Run once, first to get the optimum work size. @@ -992,7 +1008,7 @@ static void apply_svd(Tensor& self, Tensor& U, Tensor& S, Tensor& VT, value_t* S_working_ptr = &S_data[i * S_stride]; scalar_t* U_working_ptr = &U_data[i * U_stride]; scalar_t* VT_working_ptr = &VT_data[i * VT_stride]; - + // Compute S, U (optionally) and VT (optionally) lapackSvd(jobz, m, n, self_working_ptr, m, S_working_ptr, U_working_ptr, m, VT_working_ptr, n, work_data, lwork, rwork_data, iwork_data, &info); @@ -1008,7 +1024,7 @@ std::tuple _svd_helper_cpu(const Tensor& self, bool some std::vector infos(batchCount(self), 0); int64_t m = self.size(-2), n = self.size(-1); int64_t k = std::min(m, n); - + char jobz = compute_uv ? (some ? 'S' : 'A') : 'N'; Tensor U_working_copy, S_working_copy, VT_working_copy; diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index fc55379578ff..f8af756773c9 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -175,7 +175,7 @@ Tensor& divide_(Tensor& self, Scalar other) { // true_divide, an alias for div Tensor& true_divide_out(Tensor& result, const Tensor& self, const Tensor& divisor) { - return native::div_out(result, self, divisor); + return at::div_out(result, self, divisor); } Tensor true_divide(const Tensor& self, const Tensor& divisor) { @@ -390,14 +390,16 @@ Tensor rsub(const Tensor& self, const Tensor& other, Scalar alpha) { } Tensor& atan2_out(Tensor& result, const Tensor& self, const Tensor& other) { - auto iter = TensorIterator::binary_op(result, self, other); + auto iter = TensorIterator::binary_float_op(result, self, other); atan2_stub(iter.device_type(), iter); return result; } Tensor atan2(const Tensor& self, const Tensor& other) { - Tensor result = at::empty({0}, self.options()); - return native::atan2_out(result, self, other); + Tensor result; + auto iter = TensorIterator::binary_float_op(result, self, other); + atan2_stub(iter.device_type(), iter); + return iter.output(); } Tensor& atan2_(Tensor& self, const Tensor& other) { diff --git a/aten/src/ATen/native/BinaryOps.h b/aten/src/ATen/native/BinaryOps.h index e2dad35eb7ec..7640c8bd84ac 100644 --- a/aten/src/ATen/native/BinaryOps.h +++ b/aten/src/ATen/native/BinaryOps.h @@ -10,7 +10,8 @@ namespace at { namespace native { inline void alpha_check(const ScalarType dtype, Scalar alpha) { TORCH_CHECK(! 
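[Editor's note] apply_svd above replaces the inline rwork sizing (and its int-typed buffer) with a call to computeLRWorkDim and a correctly typed float/double buffer. A sketch of a sizing helper that reproduces the branches visible in the removed code (valid for LAPACK 3.6+ gesdd); the actual helper elsewhere in ATen may differ in detail.

#include <algorithm>
#include <cstdint>
#include <iostream>

// rwork length for c/z gesdd, following the branches the removed code used:
// jobz == 'N' needs 7*mn; otherwise the size depends on how rectangular the
// matrix is.
int64_t lrwork_gesdd(char jobz, int64_t m, int64_t n) {
  const int64_t mn = std::min(m, n);
  const int64_t mx = std::max(m, n);
  if (jobz == 'N') {
    return 7 * mn;
  }
  if (mx > 10 * mn) {
    return 7 * mn * mn + 7 * mn;
  }
  return std::max(7 * mn * mn + 7 * mn, 2 * mx * mn + 2 * mn * mn + mn);
}

int main() {
  std::cout << lrwork_gesdd('N', 5, 3) << '\n';    // 21
  std::cout << lrwork_gesdd('S', 100, 3) << '\n';  // 84
  std::cout << lrwork_gesdd('S', 5, 3) << '\n';    // max(84, 51) = 84
}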
alpha.isBoolean() || dtype == ScalarType::Bool, "Boolean alpha only supported for Boolean results."); - TORCH_CHECK(isFloatingType(dtype) || alpha.isIntegral(true), + TORCH_CHECK(isFloatingType(dtype) || isComplexType(dtype) + || alpha.isIntegral(true), "For integral input tensors, argument alpha must not be a floating point number."); } @@ -25,6 +26,7 @@ inline void sub_check(const Tensor& self, const Tensor& other) { } using binary_fn_alpha = void(*)(TensorIterator&, Scalar alpha); +using binary_fn_beta = void(*)(TensorIterator&, double beta); using binary_fn = void(*)(TensorIterator&); using binary_clamp_fn_alpha = void(*)(TensorIterator&, Scalar alpha, Scalar min_val, Scalar max_val); @@ -54,7 +56,7 @@ DECLARE_DISPATCH(binary_fn, max_elementwise_stub); DECLARE_DISPATCH(binary_fn, min_elementwise_stub); DECLARE_DISPATCH(binary_fn, maximum_stub); DECLARE_DISPATCH(binary_fn, minimum_stub); -DECLARE_DISPATCH(binary_fn, smooth_l1_stub); +DECLARE_DISPATCH(binary_fn_beta, smooth_l1_stub); DECLARE_DISPATCH(binary_fn, sigmoid_backward_stub); DECLARE_DISPATCH(binary_fn_alpha, logit_backward_stub); DECLARE_DISPATCH(binary_fn, tanh_backward_stub); diff --git a/aten/src/ATen/native/ComplexHelper.h b/aten/src/ATen/native/ComplexHelper.h index b8830691f47a..3fde6dbb77e1 100644 --- a/aten/src/ATen/native/ComplexHelper.h +++ b/aten/src/ATen/native/ComplexHelper.h @@ -4,12 +4,25 @@ namespace at { namespace native { -inline std::vector computeStrideForViewAsReal(IntArrayRef oldstride) { - auto res = oldstride.vec(); - for(size_t i = 0; i < res.size(); i++) { - res[i] = res[i] * 2; +// View tensor with new dtype, storage offset, sizes and strides +inline Tensor view_tensor( + const Tensor &tensor, ScalarType dtype, + int64_t offset, IntArrayRef sizes, IntArrayRef strides) { + Storage storage = tensor.storage(); + auto new_tensor = detail::make_tensor( + std::move(storage), tensor.key_set(), scalarTypeToTypeMeta(dtype)); + auto * impl = new_tensor.unsafeGetTensorImpl(); + impl->set_storage_offset(offset); + impl->set_sizes_and_strides(sizes, strides); + return new_tensor; +} + +inline DimVector computeStrideForViewAsReal(IntArrayRef oldstride) { + DimVector res(oldstride.size() + 1); + for(size_t i = 0; i < oldstride.size(); i++) { + res[i] = oldstride[i] * 2; } - res.emplace_back(1); + res.back() = 1; return res; } @@ -18,25 +31,25 @@ inline std::vector computeStrideForViewAsReal(IntArrayRef oldstride) { // in the last two dimensions Tensor view_as_real(const Tensor& self) { TORCH_CHECK(self.is_complex(), "view_as_real is only supported for complex tensors"); - auto new_sizes = self.sizes().vec(); + auto old_sizes = self.sizes(); + DimVector new_sizes(old_sizes.size() + 1); + std::copy(old_sizes.begin(), old_sizes.end(), new_sizes.begin()); // last dimension will always have two elements containing the real and imag vals - new_sizes.emplace_back(2); + new_sizes.back() = 2; auto new_strides = computeStrideForViewAsReal(self.strides()); auto new_storage_offset = 2 * self.storage_offset(); const auto float_type = c10::toValueType(self.scalar_type()); - return at::empty({0}, self.options().dtype(float_type)).set_(self.storage(), new_storage_offset, new_sizes, new_strides); + return view_tensor(self, float_type, new_storage_offset, new_sizes, new_strides); } -inline std::vector computeStrideForViewAsComplex(IntArrayRef oldstride) { - auto res = oldstride.vec(); - int dim = res.size(); - - TORCH_CHECK(res[dim-1] == 1, "Tensor must have a last dimension with stride 1"); - res.pop_back(); +inline DimVector 
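[Editor's note] view_as_real above reinterprets a complex tensor as a real one with a trailing dimension of size 2: every stride doubles, a unit stride is appended, and the storage offset doubles. A small sketch of just that size/stride arithmetic; view_as_real_geometry is an illustrative name.

#include <cstdint>
#include <iostream>
#include <vector>

// Given complex sizes/strides, produce the sizes/strides of the real view
// with a trailing dimension of 2, as computeStrideForViewAsReal does.
void view_as_real_geometry(const std::vector<int64_t>& sizes,
                           const std::vector<int64_t>& strides,
                           std::vector<int64_t>& new_sizes,
                           std::vector<int64_t>& new_strides) {
  new_sizes = sizes;
  new_sizes.push_back(2);          // real and imaginary parts
  new_strides.clear();
  for (int64_t s : strides) {
    new_strides.push_back(s * 2);  // each complex element spans two reals
  }
  new_strides.push_back(1);
}

int main() {
  std::vector<int64_t> sizes{2, 3}, strides{3, 1}, ns, nst;
  view_as_real_geometry(sizes, strides, ns, nst);
  for (auto v : ns) std::cout << v << ' ';   // 2 3 2
  std::cout << "| ";
  for (auto v : nst) std::cout << v << ' ';  // 6 2 1
  std::cout << '\n';
}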
computeStrideForViewAsComplex(IntArrayRef oldstride) { + const int64_t dim = oldstride.size(); + TORCH_CHECK(oldstride[dim-1] == 1, "Tensor must have a last dimension with stride 1"); - for (auto i = decltype(res.size()){0}; i < res.size(); i++) { - TORCH_CHECK(res[i] % 2 == 0, "Tensor must have a stride divisible by 2 for all but last dimension"); - res[i] = res[i] / 2; + DimVector res(dim - 1); + for (int64_t i = 0; i < res.size(); i++) { + TORCH_CHECK(oldstride[i] % 2 == 0, "Tensor must have a stride divisible by 2 for all but last dimension"); + res[i] = oldstride[i] / 2; } return res; } @@ -48,10 +61,10 @@ Tensor view_as_complex(const Tensor& self) { self.scalar_type() == kFloat || self.scalar_type() == kDouble || self.scalar_type() == kHalf, "view_as_complex is only supported for half, float and double tensors, but got a tensor of scalar type: ", self.scalar_type()); - TORCH_CHECK(self.dim() != 0, "Input tensor must have one or more dimensions"); - auto new_sizes = self.sizes().vec(); - TORCH_CHECK(new_sizes[self.dim()-1] == 2, "Tensor must have a last dimension of size 2"); - new_sizes.pop_back(); + auto old_sizes = self.sizes(); + TORCH_CHECK(old_sizes.size() != 0, "Input tensor must have one or more dimensions"); + TORCH_CHECK(old_sizes[old_sizes.size()-1] == 2, "Tensor must have a last dimension of size 2"); + DimVector new_sizes(old_sizes.begin(), old_sizes.end() - 1); const auto new_strides = computeStrideForViewAsComplex(self.strides()); const auto complex_type = c10::toComplexType(self.scalar_type()); @@ -59,7 +72,7 @@ Tensor view_as_complex(const Tensor& self) { TORCH_CHECK(self.storage_offset() % 2 == 0, "Tensor must have a storage_offset divisible by 2"); const auto new_storage_offset = self.storage_offset() / 2; - return at::empty({0}, self.options().dtype(complex_type)).set_(self.storage(), new_storage_offset, new_sizes, new_strides); + return view_tensor(self, complex_type, new_storage_offset, new_sizes, new_strides); } }} // namespace at::native diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index ea7903369e93..aa3a2debfe0a 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -62,6 +62,7 @@ std::ostream& operator<<(std::ostream & out, const ConvParams& params) { << " benchmark = " << params.benchmark << " deterministic = " << params.deterministic << " cudnn_enabled = " << params.cudnn_enabled + << " allow_tf32 = " << params.allow_tf32 << "}"; return out; } @@ -198,6 +199,9 @@ auto ConvParams::use_cudnn(const at::Tensor& input, const at::Tensor& weight) co if (!input.is_cuda() || !cudnn_enabled) { return false; } + if (input.scalar_type() == at::kBFloat16 || weight.scalar_type() == at::kBFloat16) { + return false; + } if (!cudnn_conv_use_channels_last(input, weight)) { // bypass dilation checks for channels-last convolution if (deterministic && is_dilated()) { // cudnn doesn't support deterministic dilated convolution fully yet diff --git a/aten/src/ATen/native/ConvolutionMM2d.cpp b/aten/src/ATen/native/ConvolutionMM2d.cpp index 9bc6b476e221..6a0ca1e67900 100644 --- a/aten/src/ATen/native/ConvolutionMM2d.cpp +++ b/aten/src/ATen/native/ConvolutionMM2d.cpp @@ -191,11 +191,10 @@ static void slow_conv2d_update_output_frame( output.reshape({n_output_plane, output_height * output_width}); if (bias.defined()) { output.copy_(bias.unsqueeze(-1).unsqueeze(-1)); + output2d.addmm_(weight, finput, 1, 1); } else { - output.zero_(); + output2d.addmm_(weight, finput, 0, 1); } - - 
output2d.addmm_(weight, finput, 1, 1); } void slow_conv2d_backward_update_grad_input_frame( @@ -434,16 +433,23 @@ std::tuple slow_conv2d_forward_out_cpu( const int64_t batch_size = input.size(0); - finput.resize_({batch_size, + if ((input.ndimension() == 4) && (kernel_height == 1) && (stride_height == 1) && (pad_height == 0) && + (kernel_width == 1) && (stride_width == 1) && (pad_width == 0)) { + finput = + input.view({batch_size, n_input_plane, output_height * output_width}) + .detach(); + } else { + finput.resize_({batch_size, n_input_plane * kernel_height * kernel_width, output_height * output_width}); + } output.resize_({batch_size, n_output_plane, output_height, output_width}); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { NoGradGuard no_grad; AutoNonVariableTypeMode non_variable_type_mode; for (int64_t t = start; t < end; t++) { - Tensor input_t = input[t]; + Tensor input_t = input[t].unsqueeze(0); Tensor output_t = output[t]; Tensor finput_t = finput[t]; slow_conv2d_update_output_frame( diff --git a/aten/src/ATen/native/ConvolutionMM3d.cpp b/aten/src/ATen/native/ConvolutionMM3d.cpp index d5a29a3abbe1..95263617e2a8 100644 --- a/aten/src/ATen/native/ConvolutionMM3d.cpp +++ b/aten/src/ATen/native/ConvolutionMM3d.cpp @@ -581,9 +581,15 @@ std::tuple slow_conv3d_forward_out_cpu( (input_width + 2 * pad_width - kernel_width) / stride_width + 1; const int64_t batch_size = input.size(0); - finput.resize_({batch_size, - n_input_plane * kernel_depth * kernel_height * kernel_width, - output_depth * output_height * output_width}); + if ((kernel_depth == 1) && (kernel_height == 1) && (kernel_width == 1) && + (pad_depth == 0) && (pad_height == 0) && (pad_width == 0) && + (stride_depth == 1) && (stride_height == 1) && (stride_width == 1) && (groups == 1)) { + finput = input.view({batch_size, n_input_plane, output_height * output_width * output_depth}).detach(); + } else { + finput.resize_({batch_size, + n_input_plane * kernel_depth * kernel_height * kernel_width, + output_depth * output_height * output_width}); + } output.resize_( {batch_size, n_output_plane, output_depth, output_height, output_width}); diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp index 79fb0a11fba4..23b81a655507 100644 --- a/aten/src/ATen/native/Copy.cpp +++ b/aten/src/ATen/native/Copy.cpp @@ -34,7 +34,7 @@ void copy_same_type_transpose_(Tensor& self, const Tensor& src) { } Tensor buf = empty({BLOCK_SZ, BLOCK_SZ}, self.options()); - AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBool, kBFloat16, self.scalar_type(), "copy_", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, kBFloat16, self.scalar_type(), "copy_", [&] { scalar_t* sp = src.data_ptr(); scalar_t* rp = self.data_ptr(); scalar_t* bp = buf.data_ptr(); diff --git a/aten/src/ATen/native/ForeachOpsKernels.cpp b/aten/src/ATen/native/ForeachOpsKernels.cpp index 912b5116c4cc..73eb2070c07d 100644 --- a/aten/src/ATen/native/ForeachOpsKernels.cpp +++ b/aten/src/ATen/native/ForeachOpsKernels.cpp @@ -24,6 +24,26 @@ std::vector foreach_tensor_##NAME##_scalar_kernel_slow(TensorList tensor return result; \ } +#define FOREACH_BINARY_OP_SCALARLIST(NAME) \ +void foreach_tensor_##NAME##_scalarlist_kernel_slow_(TensorList tensors, at::ArrayRef scalars) { \ + check_foreach_api_restrictions(tensors, scalars); \ + \ + for (int i = 0; i < tensors.size(); i++) { \ + tensors[i].NAME##_(scalars[i]); \ + } \ +} \ + \ +std::vector foreach_tensor_##NAME##_scalarlist_kernel_slow(TensorList tensors, at::ArrayRef scalars) { \ + 
check_foreach_api_restrictions(tensors, scalars); \ + std::vector result; \ + result.reserve(tensors.size()); \ + for (int i = 0; i < tensors.size(); i++) { \ + result.emplace_back(tensors[i].NAME(scalars[i])); \ + } \ + \ + return result; \ +} + #define FOREACH_BINARY_OP_LIST(NAME) \ std::vector foreach_tensor_##NAME##_list_kernel_slow(TensorList tensors1, TensorList tensors2) { \ check_foreach_api_restrictions(tensors1, tensors2); \ @@ -117,6 +137,10 @@ FOREACH_BINARY_OP_SCALAR(add); FOREACH_BINARY_OP_SCALAR(sub); FOREACH_BINARY_OP_SCALAR(mul); FOREACH_BINARY_OP_SCALAR(div); +FOREACH_BINARY_OP_SCALARLIST(add); +FOREACH_BINARY_OP_SCALARLIST(sub); +FOREACH_BINARY_OP_SCALARLIST(mul); +FOREACH_BINARY_OP_SCALARLIST(div); FOREACH_BINARY_OP_LIST(mul); FOREACH_BINARY_OP_LIST(div); FOREACH_UNARY_OP(sqrt); diff --git a/aten/src/ATen/native/ForeachUtils.h b/aten/src/ATen/native/ForeachUtils.h index 5a7aced74702..f634d4804a6d 100644 --- a/aten/src/ATen/native/ForeachUtils.h +++ b/aten/src/ATen/native/ForeachUtils.h @@ -31,6 +31,12 @@ void check_foreach_api_restrictions(TensorList tensors1, TensorList tensors2) { } } +void check_foreach_api_restrictions(TensorList tensors, ArrayRef scalars) { + TORCH_CHECK(tensors.size() > 0, "Tensor list must have at least one tensor."); + TORCH_CHECK(scalars.size() > 0, "Scalars list must have at least one value."); + TORCH_CHECK(tensors.size() == scalars.size(), "Tensor list must have same number of elements as scalar list."); +} + // To go via 'fast' path, several conditions must be satisfied // - All tensors must be on the same device // - All tensors must have strided layout @@ -68,7 +74,7 @@ bool can_use_fast_route(TensorList tensors, Scalar scalar) { return false; } - // integral scalar + boolean tensor will result in integral tensor + // integral scalar + boolean tensor will result in integral tensor if (scalar.isIntegral(/*includeBool*/ false) && t.dtype() == at::kBool) { return false; } @@ -83,17 +89,17 @@ bool can_use_fast_route(TensorList tensors1, TensorList tensors2) { for (int64_t i = 0; i < tensors1.size(); i++) { TORCH_CHECK(tensors1[i].sizes() == tensors2[i].sizes(), "Corresponding tensors from tensor lists have different size."); - if (tensors1[i].device() != expected_device || + if (tensors1[i].device() != expected_device || tensors2[i].device() != expected_device) { return false; } - if (tensors1[i].layout() != at::kStrided || + if (tensors1[i].layout() != at::kStrided || tensors2[i].layout() != at::kStrided) { return false; } - if (tensors1[i].device() != expected_device || + if (tensors1[i].device() != expected_device || tensors2[i].device() != expected_device) { return false; } @@ -102,7 +108,7 @@ bool can_use_fast_route(TensorList tensors1, TensorList tensors2) { return false; } - if (!tensors1[i].is_non_overlapping_and_dense() || + if (!tensors1[i].is_non_overlapping_and_dense() || !tensors2[i].is_non_overlapping_and_dense()) { return false; } @@ -132,5 +138,13 @@ bool can_use_fast_route(TensorList tensors) { return true; } +bool can_use_fast_route(TensorList tensors, ArrayRef scalars) { + TORCH_CHECK(tensors.size() > 0, "Tensor list must have at least one tensor."); + TORCH_CHECK(scalars.size() > 0, "Scalars list must have at least one value."); + TORCH_CHECK(tensors.size() == scalars.size(), "Tensor list must have same number of elements as scalar list."); + + return can_use_fast_route(tensors); +} + } }} // at::native diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 
e93eb11f642c..9c3742c129de 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -143,50 +143,61 @@ static void check_1d(const Tensor& t, const char* arg, const char* fn) { } Tensor addr(const Tensor& self, const Tensor& vec1, const Tensor& vec2, Scalar beta, Scalar alpha) { - check_1d(vec1, "vec1", "addr"); - check_1d(vec2, "vec2", "addr"); - Tensor b_self; - std::tie(b_self) = expand_size(self, {vec1.size(0), vec2.size(0)}, "addr"); - return at::_addr(b_self, vec1, vec2, beta, alpha); + TORCH_WARN( + "torch.addr is deprecated and may be removed in a future PyTorch release. " + "This function can be implemented using torch.outer as " + "alpha * torch.outer(vec1, vec2) + beta * input when beta is not zero, " + "alpha * torch.outer(vec1, vec2) when beta is zero."); + + Tensor outer_result = at::outer(vec1, vec2) * alpha; + if (beta.to() == 0.0) { + return outer_result; + } + return outer_result + (self * beta); } Tensor& addr_(Tensor& self, const Tensor& vec1, const Tensor& vec2, Scalar beta, Scalar alpha) { - check_1d(vec1, "vec1", "addr"); - check_1d(vec2, "vec2", "addr"); - return at::_addr_(self, vec1, vec2, beta, alpha); + return at::addr_out(self, self, vec1, vec2, beta, alpha); } Tensor& addr_out(Tensor &result, const Tensor& self, const Tensor& vec1, const Tensor& vec2, Scalar beta, Scalar alpha) { - check_1d(vec1, "vec1", "addr"); - check_1d(vec2, "vec2", "addr"); - Tensor b_self; - std::tie(b_self) = expand_size(self, {vec1.size(0), vec2.size(0)}, "addr_out"); - return at::_addr_out(result, b_self, vec1, vec2, beta, alpha); + auto addr_result = at::addr(self, vec1, vec2, beta, alpha); + // Validates safe casting + const auto result_dtype = addr_result.scalar_type(); + TORCH_CHECK(canCast(result_dtype, result.scalar_type()), + "result type ", result_dtype, + " can't be cast to the desired output type ", result.scalar_type()); + + at::native::resize_output(result, addr_result.sizes().vec()); + result.copy_(addr_result); + return result; } +// torch.ger, alias for torch.outer Tensor& ger_out(Tensor &result, const Tensor& self, const Tensor& vec2) { - check_1d(self, "self", "ger"); - check_1d(vec2, "vec2", "ger"); - if (result.dim() != 2 || result.size(0) != self.size(0) || result.size(1) != vec2.size(0)) { - result.resize_({ self.size(0), vec2.size(0) }); - } - // resize_ does the "broadcasting", don't need to broadcast again. - return at::_addr_out(result, result, self, vec2, Scalar(0), Scalar(1)); + TORCH_WARN("torch.ger is deprecated and will be removed in a future PyTorch release. 
" + "Use torch.outer instead."); + return at::outer_out(result, self, vec2); } Tensor ger(const Tensor& self, const Tensor& vec2) { - Tensor result = at::empty({0}, self.options()); - at::ger_out(result, self, vec2); - return result; + return self.outer(vec2); } -// torch.outer, alias for torch.ger Tensor& outer_out(Tensor &result, const Tensor& self, const Tensor& vec2) { - return at::ger_out(result, self, vec2); + check_1d(self, "self", "outer"); + check_1d(vec2, "vec2", "outer"); + + // torch.outer is implemented as a composite op using reshape and mul + at::mul_out(result, self.reshape({self.size(0), 1}), vec2); + return result; } Tensor outer(const Tensor& self, const Tensor& vec2) { - return self.ger(vec2); + check_1d(self, "self", "outer"); + check_1d(vec2, "vec2", "outer"); + + return self.reshape({self.size(0), 1}) * vec2; } static void addmm_impl_cpu_( @@ -1223,6 +1234,8 @@ Tensor matrix_exp(const Tensor& a) { "matrix_exp(", a.scalar_type(), "{", a.sizes(), "}): expected a tensor " "of squared matrices"); + NoTF32Guard disable_tf32; + if (a.size(-1) == 1) { return a.exp(); } @@ -1231,6 +1244,7 @@ Tensor matrix_exp(const Tensor& a) { } Tensor matrix_exp_backward(const Tensor& self, const Tensor& grad) { + NoTF32Guard disable_tf32; return backward_analytic_function_of_a_matrix( self, grad, [](const Tensor& a) { diff --git a/aten/src/ATen/native/LinearAlgebraUtils.h b/aten/src/ATen/native/LinearAlgebraUtils.h index 5c07700f1e85..4a6af18a5a96 100644 --- a/aten/src/ATen/native/LinearAlgebraUtils.h +++ b/aten/src/ATen/native/LinearAlgebraUtils.h @@ -318,4 +318,19 @@ static inline std::vector create_reverse_permutation(std::vector 10 * mn) { + return 5 * mn * mn + 5 * mn; + } + return std::max(5 * mn * mn + 5 * mn, 2 * mx * mn + 2 * mn * mn + mn); +} + }} // namespace at::native diff --git a/aten/src/ATen/native/Loss.cpp b/aten/src/ATen/native/Loss.cpp index 8dc5432d8a8c..2a3e97cf5dd8 100644 --- a/aten/src/ATen/native/Loss.cpp +++ b/aten/src/ATen/native/Loss.cpp @@ -295,24 +295,41 @@ Tensor soft_margin_loss( return output; } -Tensor smooth_l1_loss(const Tensor& input, const Tensor& target, const int64_t reduction) { +Tensor smooth_l1_loss(const Tensor& input, const Tensor& target, const int64_t reduction, double beta) { + TORCH_CHECK(beta >= 0, "smooth_l1_loss does not support negative values for beta.") + if (beta == 0) { + return at::native::l1_loss(input, target, reduction); + } Tensor loss; auto iter = TensorIterator::binary_op(loss, input, target); - smooth_l1_stub(iter.device_type(), iter); + smooth_l1_stub(iter.device_type(), iter, beta); return apply_loss_reduction(iter.output(), reduction); } -Tensor& smooth_l1_loss_out(Tensor& result, const Tensor& input, const Tensor& target, int64_t reduction) { +Tensor& smooth_l1_loss_out(Tensor& result, const Tensor& input, const Tensor& target, int64_t reduction, double beta) { + TORCH_CHECK(beta >= 0, "smooth_l1_loss does not support negative values for beta.") + if (beta == 0) { + return at::native::l1_loss_out(result, input, target, reduction); + } if (reduction != Reduction::None) { - result = at::smooth_l1_loss(input, target, reduction); + Tensor loss; + auto iter = TensorIterator::binary_op(loss, input, target); + smooth_l1_stub(iter.device_type(), iter, beta); + if (reduction == Reduction::Mean) { + at::mean_out(result, iter.output(), 0); + } else { + at::sum_out(result, iter.output(), 0); + } } else { auto iter = TensorIterator::binary_op(result, input, target); - smooth_l1_stub(iter.device_type(), iter); + 
smooth_l1_stub(iter.device_type(), iter, beta); } return result; } -Tensor& smooth_l1_loss_backward_out(Tensor& grad_input, const Tensor& grad_output, const Tensor& input, const Tensor& target, int64_t reduction) { +Tensor& smooth_l1_loss_backward_out(Tensor& grad_input, const Tensor& grad_output, const Tensor& input, const Tensor& target, int64_t reduction, double beta) { + if (beta <= 0) + return at::native::l1_loss_backward_out(grad_input, grad_output, input, target, reduction); auto norm = reduction == Reduction::Mean ? 1. / input.numel() : 1.; auto iter = at::TensorIteratorConfig() .add_output(grad_input) @@ -320,13 +337,15 @@ Tensor& smooth_l1_loss_backward_out(Tensor& grad_input, const Tensor& grad_outpu .add_input(target) .add_input(grad_output) .build(); - smooth_l1_backward_stub(iter.device_type(), iter, norm); + smooth_l1_backward_stub(iter.device_type(), iter, norm, beta); return grad_input; } -Tensor smooth_l1_loss_backward(const Tensor& grad_output, const Tensor& input, const Tensor& target, int64_t reduction) { +Tensor smooth_l1_loss_backward(const Tensor& grad_output, const Tensor& input, const Tensor& target, int64_t reduction, double beta) { + if (beta <= 0) + return at::native::l1_loss_backward(grad_output, input, target, reduction); auto grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - return at::smooth_l1_loss_backward_out(grad_input, grad_output, input, target, reduction); + return at::smooth_l1_loss_backward_out(grad_input, grad_output, input, target, reduction, beta); } Tensor mse_loss(const Tensor& input, const Tensor& target, int64_t reduction) { diff --git a/aten/src/ATen/native/MaxPooling.cpp b/aten/src/ATen/native/MaxPooling.cpp index a0298ea937de..645822f55065 100644 --- a/aten/src/ATen/native/MaxPooling.cpp +++ b/aten/src/ATen/native/MaxPooling.cpp @@ -97,6 +97,10 @@ Tensor max_pool1d( IntArrayRef padding, IntArrayRef dilation, bool ceil_mode) { + if (self.is_quantized()) { + return at::quantized_max_pool1d(self, kernel_size, stride, padding, + dilation, ceil_mode); + } if (self.requires_grad() || !self.device().is_cpu()) { // Needs indices for grad and with_indices defines CUDA dispatch return std::get<0>(at::max_pool1d_with_indices( diff --git a/aten/src/ATen/native/MetaTensor.cpp b/aten/src/ATen/native/MetaTensor.cpp index 2ae5fb0f9d59..f8f0231b181c 100644 --- a/aten/src/ATen/native/MetaTensor.cpp +++ b/aten/src/ATen/native/MetaTensor.cpp @@ -25,7 +25,7 @@ Tensor empty_meta( // participate in dispatch, but so that tests like is_sparse/is_cuda // give the correct result (a CUDA meta tensor "is cuda"). 
If we don't
  // like this, remove the computeDispatchKey line
-      DispatchKeySet{DispatchKey::Meta, computeDispatchKey(options)},
+      DispatchKeySet{DispatchKey::Meta, options.computeDispatchKey()},
       dtype,
       device
   );
diff --git a/aten/src/ATen/native/PointwiseOps.h b/aten/src/ATen/native/PointwiseOps.h
index e81a89454905..98df21121ba3 100644
--- a/aten/src/ATen/native/PointwiseOps.h
+++ b/aten/src/ATen/native/PointwiseOps.h
@@ -11,10 +11,11 @@ struct TensorIterator;
 namespace native {

 using pointwise_fn = void (*)(TensorIterator&, Scalar scalar);
+using pointwise_fn_beta = void (*)(TensorIterator&, Scalar scalar, double beta);

 DECLARE_DISPATCH(pointwise_fn, addcmul_stub);
 DECLARE_DISPATCH(pointwise_fn, addcdiv_stub);
-DECLARE_DISPATCH(pointwise_fn, smooth_l1_backward_stub);
+DECLARE_DISPATCH(pointwise_fn_beta, smooth_l1_backward_stub);
 DECLARE_DISPATCH(pointwise_fn, mse_backward_stub);

 } // namespace native
diff --git a/aten/src/ATen/native/Pow.cpp b/aten/src/ATen/native/Pow.cpp
index 414c8a6f6390..c10a617a5928 100644
--- a/aten/src/ATen/native/Pow.cpp
+++ b/aten/src/ATen/native/Pow.cpp
@@ -43,7 +43,9 @@ Tensor& pow_out(Tensor& result, const Tensor& base, Scalar exp) {
 }

 Tensor& pow_out(Tensor& result, Scalar base, const Tensor& exp) {
-  if (base.toDouble() == 1.0) {
+  if (base.isComplex() && base.toComplexDouble() == 1.0) {
+    result.resize_as_(exp).fill_(1);
+  } else if (!base.isComplex() && base.toDouble() == 1.0) {
     result.resize_as_(exp).fill_(1);
   } else {
     native::pow_out(result, c10::scalar_to_tensor(base, exp.device()), exp);
diff --git a/aten/src/ATen/native/README.md b/aten/src/ATen/native/README.md
index 861901521a3b..f18114e73246 100644
--- a/aten/src/ATen/native/README.md
+++ b/aten/src/ATen/native/README.md
@@ -277,6 +277,18 @@ them the same thing!)
 If two backends have the same dispatch function, you can write `CPU, CUDA: func`
 to reuse the same function name in both cases.

+Available backend options can be found at
+https://github.com/pytorch/pytorch/blob/master/tools/codegen/gen.py#L970.
+In addition to the backends above, we also support the keyword `Math`, an alias that maps
+to all backend and autograd backend keys. In other words, a function registered to the `Math`
+key should be a plain mathematical composition of other `at::` functions and should work for any backend.
+
+If you add a `dispatch` section to any API that didn't have one before, you **have to** move
+the old implementation to the `Math` field so that it stays available for other backends to use.
+
+This work is currently in progress; you can find the design proposal at
+https://github.com/pytorch/pytorch/issues/44680.
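The `Math` alias described above is exactly the style used by the reworked `addr`/`outer` code earlier in this patch: the operator is expressed as a plain composition of other `at::` calls. As a rough Python-level sketch of that same composition (illustrative only; `addr_reference` is a made-up name, and the reshape-and-multiply step mirrors the new `at::outer` implementation):

```python
import torch

def addr_reference(input, vec1, vec2, beta=1, alpha=1):
    # Mirrors the torch.addr deprecation message in this patch:
    # alpha * outer(vec1, vec2) + beta * input, dropping the input term when beta == 0.
    outer = alpha * vec1.reshape(-1, 1) * vec2   # same reshape+mul composition as the new at::outer
    return outer if beta == 0 else outer + beta * input

v1, v2 = torch.arange(3.), torch.arange(4.)
M = torch.ones(3, 4)
print(torch.allclose(addr_reference(M, v1, v2, beta=2, alpha=3),
                     torch.addr(M, v1, v2, beta=2, alpha=3)))  # expected: True
```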
+ ### `device_guard` ``` diff --git a/aten/src/ATen/native/Sorting.cpp b/aten/src/ATen/native/Sorting.cpp index ffddddfd2ba5..7394365903ed 100644 --- a/aten/src/ATen/native/Sorting.cpp +++ b/aten/src/ATen/native/Sorting.cpp @@ -274,33 +274,6 @@ std::tuple kthvalue_out_impl_cpu( } // namespace -std::tuple kthvalue_out_cpu( - Tensor& values, - Tensor& indices, - const Tensor& self, - int64_t k, - int64_t dim, - bool keepdim) { - auto result = [&]() { - NoNamesGuard guard; - return kthvalue_out_impl_cpu(values, indices, self, k, dim, keepdim); - }(); - namedinference::propagate_names_for_reduction(values, self, dim, keepdim); - namedinference::propagate_names_for_reduction(indices, self, dim, keepdim); - return result; -} - -std::tuple kthvalue( - const Tensor& self, - int64_t k, - int64_t dim, - bool keepdim) { - Tensor values = at::empty({0}, self.options()); - Tensor indices = at::empty({0}, self.options().dtype(kLong)); - at::kthvalue_out(values, indices, self, k, dim, keepdim); - return std::make_tuple(values, indices); -} - Tensor& quantile_out( Tensor& out, const Tensor& self, @@ -395,6 +368,52 @@ Tensor nanquantile( self, at::scalar_tensor(q, self.options()), std::move(_dim), keepdim); } +std::tuple kthvalue_out_cpu( + Tensor& values, + Tensor& indices, + const Tensor& self, + int64_t k, + int64_t dim, + bool keepdim) { + auto result = [&]() { + NoNamesGuard guard; + return kthvalue_out_impl_cpu(values, indices, self, k, dim, keepdim); + }(); + namedinference::propagate_names_for_reduction(values, self, dim, keepdim); + namedinference::propagate_names_for_reduction(indices, self, dim, keepdim); + return result; +} + +std::tuple kthvalue_out( + Tensor& values, + Tensor& indices, + const Tensor& self, + int64_t k, + Dimname dim, + bool keepdim) { + return at::kthvalue_out( + values, indices, self, k, dimname_to_position(self, dim), keepdim); +} + +std::tuple kthvalue( + const Tensor& self, + int64_t k, + int64_t dim, + bool keepdim) { + Tensor values = at::empty({0}, self.options()); + Tensor indices = at::empty({0}, self.options().dtype(kLong)); + at::kthvalue_out(values, indices, self, k, dim, keepdim); + return std::make_tuple(values, indices); +} + +std::tuple kthvalue( + const Tensor& self, + int64_t k, + Dimname dim, + bool keepdim) { + return at::kthvalue(self, k, dimname_to_position(self, dim), keepdim); +} + std::tuple topk_out_cpu( Tensor& values, Tensor& indices, @@ -432,6 +451,33 @@ std::tuple topk( return std::make_tuple(values, indices); } +// this does not reduce to median with dim because we don't want to copy twice +Tensor median_cpu(const Tensor& self) { + NoNamesGuard guard; + TORCH_CHECK(self.numel() > 0, "median cannot be called with empty tensor"); + if (self.dim() == 0 && self.numel() == 1) { + return self.clone(at::MemoryFormat::Contiguous); + } + auto tmp_values = self.clone(at::MemoryFormat::Contiguous).view(-1); + auto result = at::empty({1}, self.options()); + AT_DISPATCH_ALL_TYPES(self.scalar_type(), "median", [&] { + // note, quick_select is 0 based while kthvalue is not + int64_t k = (tmp_values.size(0) - 1) / 2; + auto val_accessor = tmp_values.accessor(); + quick_select_template( + val_accessor, + k, + [](scalar_t x, scalar_t y) -> bool { + return ((_isnan(x) && !_isnan(y)) || (x > y)); + }, + [&](int64_t i, int64_t j) { + std::swap(val_accessor[i], val_accessor[j]); + }); + result.fill_(tmp_values[k]); + }); + return result.view({}); +} + std::tuple median_out( Tensor& values, Tensor& indices, @@ -444,16 +490,6 @@ std::tuple median_out( return 
std::forward_as_tuple(values, indices); } -std::tuple median( - const Tensor& self, - int64_t dim, - bool keepdim) { - Tensor values = at::empty({0}, self.options()); - Tensor indices = at::empty({0}, self.options().dtype(kLong)); - at::median_out(values, indices, self, dim, keepdim); - return std::make_tuple(values, indices); -} - std::tuple median_out( Tensor& values, Tensor& indices, @@ -466,55 +502,19 @@ std::tuple median_out( std::tuple median( const Tensor& self, - Dimname dim, - bool keepdim) { - return at::median(self, dimname_to_position(self, dim), keepdim); -} - -std::tuple kthvalue_out( - Tensor& values, - Tensor& indices, - const Tensor& self, - int64_t k, - Dimname dim, + int64_t dim, bool keepdim) { - return at::kthvalue_out( - values, indices, self, k, dimname_to_position(self, dim), keepdim); + Tensor values = at::empty({0}, self.options()); + Tensor indices = at::empty({0}, self.options().dtype(kLong)); + at::median_out(values, indices, self, dim, keepdim); + return std::make_tuple(values, indices); } -std::tuple kthvalue( +std::tuple median( const Tensor& self, - int64_t k, Dimname dim, bool keepdim) { - return at::kthvalue(self, k, dimname_to_position(self, dim), keepdim); -} - -// this does not reduce to median with dim because we don't want to copy twice -Tensor median_cpu(const Tensor& self) { - NoNamesGuard guard; - TORCH_CHECK(self.numel() > 0, "median cannot be called with empty tensor"); - if (self.dim() == 0 && self.numel() == 1) { - return self.clone(at::MemoryFormat::Contiguous); - } - auto tmp_values = self.clone(at::MemoryFormat::Contiguous).view(-1); - auto result = at::empty({1}, self.options()); - AT_DISPATCH_ALL_TYPES(self.scalar_type(), "median", [&] { - // note, quick_select is 0 based while kthvalue is not - int64_t k = (tmp_values.size(0) - 1) / 2; - auto val_accessor = tmp_values.accessor(); - quick_select_template( - val_accessor, - k, - [](scalar_t x, scalar_t y) -> bool { - return ((_isnan(x) && !_isnan(y)) || (x > y)); - }, - [&](int64_t i, int64_t j) { - std::swap(val_accessor[i], val_accessor[j]); - }); - result.fill_(tmp_values[k]); - }); - return result.view({}); + return at::median(self, dimname_to_position(self, dim), keepdim); } std::tuple sort_out_cpu( diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index e467c21a4a30..21e4d63b163b 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -203,19 +203,129 @@ Tensor fft_c2c(Tensor input, c10::optional n_opt, return out; } +// Dimensions to transform, and the signal shape in those dimensions +struct ShapeAndDims { + DimVector shape, dim; +}; + +// Pre-process n-dimensional fft's `s` and `dim` arguments. +// Wraps dimensions and applies defaulting behavior. +// Also checks transform dims are unique and transform shape is non-empty. 
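The defaulting behaviour summarised in the comment above can be restated at the Python level roughly as follows. This is a simplified, illustrative helper (made-up name, plain Python, no handling of mismatched `s`/`dim` lengths), not the ATen implementation:

```python
def canonicalize_shape_and_dim(input_sizes, s=None, dim=None):
    # Rough restatement of the s/dim defaulting rules for the n-dimensional FFTs.
    ndim = len(input_sizes)
    if dim is not None:
        dim = [d % ndim for d in dim]                      # wrap negative dims
        assert len(set(dim)) == len(dim), "FFT dims must be unique"
    if s is not None:
        if dim is None:                                    # default: the last len(s) dims
            dim = list(range(ndim - len(s), ndim))
        s = [input_sizes[d] if n == -1 else n for n, d in zip(s, dim)]
    elif dim is None:                                      # no s, no dim: transform everything
        dim = list(range(ndim))
        s = list(input_sizes)
    else:                                                  # no s, has dim: full size of each dim
        s = [input_sizes[d] for d in dim]
    assert all(n > 0 for n in s), "Invalid number of data points"
    return s, dim

print(canonicalize_shape_and_dim([2, 3, 4], s=[-1, 8]))    # expected: ([3, 8], [1, 2])
```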
+ShapeAndDims canonicalize_fft_shape_and_dim_args( + Tensor input, c10::optional shape, c10::optional dim) { + const int64_t input_dim = input.dim(); + const IntArrayRef input_sizes = input.sizes(); + ShapeAndDims ret; + + if (dim) { + ret.dim.resize(dim->size()); + std::copy(dim->begin(), dim->end(), ret.dim.begin()); + maybe_wrap_dims(ret.dim, input_dim); + + // Check dims are unique + DimVector copy = ret.dim; + std::sort(copy.begin(), copy.end()); + auto duplicate = std::adjacent_find(copy.begin(), copy.end()); + TORCH_CHECK(duplicate == copy.end(), "FFT dims must be unique"); + } + + if (shape) { + // Has shape, may have dim + TORCH_CHECK(!dim || dim->size() == shape->size(), + "When given, dim and shape arguments must have the same length"); + TORCH_CHECK(shape->size() <= input_dim, + "Got shape with ", shape->size(), " values but input tensor " + "only has ", input_dim, " dimensions."); + const int64_t transform_ndim = shape->size(); + // If shape is given, dims defaults to the last shape.size() dimensions + if (!dim) { + ret.dim.resize(transform_ndim); + std::iota(ret.dim.begin(), ret.dim.end(), input_dim - transform_ndim); + } + + // Translate shape of -1 to the default length + ret.shape.resize(transform_ndim); + for (int64_t i = 0; i < transform_ndim; ++i) { + const auto n = (*shape)[i]; + ret.shape[i] = n == -1 ? input_sizes[ret.dim[i]] : n; + } + } else if (!dim) { + // No shape, no dim + ret.dim.resize(input_dim); + std::iota(ret.dim.begin(), ret.dim.end(), int64_t{0}); + ret.shape.resize(input_dim); + std::copy(input_sizes.begin(), input_sizes.end(), ret.shape.begin()); + } else { + // No shape, has dim + ret.shape.resize(ret.dim.size()); + for (int64_t i = 0; i < ret.dim.size(); ++i) { + ret.shape[i] = input_sizes[ret.dim[i]]; + } + } + + for (int64_t i = 0; i < ret.shape.size(); ++i) { + TORCH_CHECK(ret.shape[i] > 0, + "Invalid number of data points (", ret.shape[i], ") specified"); + } + + return ret; +} + +// Complex to complex n-dimensional fft +Tensor fftn_c2c( + const Tensor& input, IntArrayRef shape, IntArrayRef dim, + c10::optional norm_str, bool forward) { + TORCH_CHECK(input.is_complex(), "Expected a complex input tensor to FFT"); + const auto input_dim = input.dim(); + + Tensor x = resize_fft_input(input, dim, shape); + x = at::view_as_real(x); + + const int64_t transform_ndim = dim.size(); + const auto norm = norm_from_string(norm_str, forward); + // _fft_with_size only supports 3 dimensions being transformed at a time. + // This limit is inherited from cuFFT. + constexpr int64_t max_signal_ndim = 3; + + // Transform n dimensions, up to 3 at a time + // TODO: rewrite _fft_with_size to transform more than 3 dimensions at once. 
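To make the three-at-a-time chunking concrete, here is a tiny plain-Python illustration (made-up dims) of how transform dimensions are grouped and mapped to the trailing positions, mirroring the `source_dim`/`dest_dim` bookkeeping in the loop that follows:

```python
# cuFFT-inherited limit: _fft_with_size transforms at most 3 dims per call.
input_ndim = 6
dims = [0, 2, 3, 5]                 # hypothetical dims to transform
MAX_SIGNAL_NDIM = 3

for i in range(0, len(dims), MAX_SIGNAL_NDIM):
    group = dims[i:i + MAX_SIGNAL_NDIM]
    dest = list(range(input_ndim - len(group), input_ndim))
    print(group, "->", dest)        # each group is moved to the end, transformed, moved back
# [0, 2, 3] -> [3, 4, 5]
# [5] -> [5]
```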
+ for (int64_t i = 0; i < transform_ndim; i += max_signal_ndim) { + const int64_t signal_ndim = std::min(transform_ndim - i, max_signal_ndim); + DimVector source_dim(signal_ndim); + DimVector dest_dim(signal_ndim); + + for (int64_t j = 0; j < signal_ndim; ++j) { + source_dim[j] = dim[i + j]; + dest_dim[j] = j + (input_dim - signal_ndim); + } + + // _fft operates on up-to the last 3 dims, so move selected dims to the end + x = at::movedim(x, source_dim, dest_dim); + + x = _fft(x, signal_ndim, /*complex_input=*/true, /*complex_output=*/true, + /*inverse=*/!forward, /*signal_sizes=*/{}, /*normalization=*/norm, + /*onesided=*/false); + + // Move transform dims back to their original order + x = at::movedim(x, dest_dim, source_dim); + } + + return at::view_as_complex(x); +} + } // torch.fft.fft, analogous to NumPy's numpy.fft.fft Tensor fft_fft(const Tensor& self, c10::optional n, int64_t dim, c10::optional norm) { - return self.is_complex() ? + return self.is_complex() ? fft_c2c(self, n, dim, norm, /*forward=*/true) : fft_r2c(self, n, dim, norm, /*forward=*/true, /*onesided=*/false); } Tensor fft_ifft(const Tensor& self, c10::optional n, int64_t dim, c10::optional norm) { - return self.is_complex() ? + return self.is_complex() ? fft_c2c(self, n, dim, norm, /*forward=*/false) : fft_r2c(self, n, dim, norm, /*forward=*/false, /*onesided=*/false); } @@ -240,6 +350,128 @@ Tensor fft_ihfft(const Tensor& self, c10::optional n, int64_t dim, return fft_r2c(self, n, dim, norm, /*forward=*/false, /*onesided=*/true); } +Tensor fft_fftn(const Tensor& self, c10::optional s, + c10::optional dim, + c10::optional norm) { + auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); + // TODO: For real input, perform rfftn then mirror with conjugate symmetry + Tensor input = promote_tensor_fft(self, /*require_complex=*/true); + return fftn_c2c(input, desc.shape, desc.dim, norm, /*forward=*/true); +} + +Tensor fft_ifftn(const Tensor& self, c10::optional s, + c10::optional dim, + c10::optional norm) { + auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); + Tensor input = promote_tensor_fft(self, /*require_complex=*/true); + return fftn_c2c(input, desc.shape, desc.dim, norm, /*forward=*/false); +} + +Tensor fft_rfftn(const Tensor& self, c10::optional s, + c10::optional dim, + c10::optional norm) { + auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); + TORCH_CHECK(desc.shape.size() > 0, "rfftn must transform at least one axis"); + + const auto last_dim = desc.dim.back(); + const auto last_shape = desc.shape.back(); + desc.shape.pop_back(); + desc.dim.pop_back(); + + // rfft on last dim to get hermitian complex shape + auto x = native::fft_rfft(self, last_shape, last_dim, norm); + // Normal fft on remaining dims + return fftn_c2c(x, desc.shape, desc.dim, norm, /*forward=*/true); +} + +Tensor fft_irfftn(const Tensor& self, c10::optional s, + c10::optional dim, + c10::optional norm) { + auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); + TORCH_CHECK(desc.shape.size() > 0, "irfftn must transform at least one axis"); + + const auto last_dim = desc.dim.back(); + const auto last_shape = [&]() -> c10::optional { + // If shape is defaulted in the last dimension, + // pass nullopt to irfft and let it calculate the default size + if (!s.has_value() || (s->back() == -1)) { + return c10::nullopt; + } + return desc.shape.back(); + }(); + desc.shape.pop_back(); + desc.dim.pop_back(); + + // Normal ifft for all but last dim + Tensor x = promote_tensor_fft(self, /*require_complex=*/true); + x 
= fftn_c2c(x, desc.shape, desc.dim, norm, /*forward=*/false); + // Then 1d irfft on last dim to get real output + return native::fft_irfft(x, last_shape, last_dim, norm); +} + +Tensor fft_fftfreq(int64_t n, double d, const TensorOptions& options) { + ScalarType dtype = typeMetaToScalarType(options.dtype()); + TORCH_CHECK(at::isFloatingType(dtype) || at::isComplexType(dtype), + "fftfreq requires a floating point or complex dtype"); + // TODO: arange doesn't have complex support + Tensor result = native::arange(n, options); + auto right_slice = result.slice(0, (n + 1) / 2, 0); + at::arange_out(right_slice, -(n/2), 0, 1); + result.mul_(1.0 / (n * d)); // Slightly faster than div_(n*d) + return result; +} + +Tensor fft_rfftfreq(int64_t n, double d, const TensorOptions& options) { + ScalarType dtype = typeMetaToScalarType(options.dtype()); + TORCH_CHECK(at::isFloatingType(dtype) || at::isComplexType(dtype), + "rfftfreq requires a floating point or complex dtype"); + // TODO: arange doesn't have complex support + Tensor result = native::arange(n/2 + 1, options); + result.mul_(1.0 / (n * d)); // Slightly faster than div_(n*d) + return result; +} + +// If an array dim is specified, wraps them according to self.dim(). +// Otherwise returns a vector of all dims. +DimVector default_alldims(const Tensor& self, c10::optional dim_opt) { + DimVector dim; + if (dim_opt) { + IntArrayRef dim_unwrapped = *dim_opt; + dim.resize(dim_unwrapped.size()); + for (int64_t i = 0; i < dim.size(); ++i) { + dim[i] = maybe_wrap_dim(dim_unwrapped[i], self.dim()); + } + } else { + dim.resize(self.dim()); + std::iota(dim.begin(), dim.end(), 0); + } + return dim; +} + +Tensor fft_fftshift(const Tensor& x, c10::optional dim_opt) { + auto dim = default_alldims(x, dim_opt); + + IntArrayRef x_sizes = x.sizes(); + DimVector shift(dim.size()); + for (int64_t i = 0; i < dim.size(); ++i) { + shift[i] = x_sizes[dim[i]] / 2; + } + + return at::roll(x, shift, dim); +} + +Tensor fft_ifftshift(const Tensor& x, c10::optional dim_opt) { + auto dim = default_alldims(x, dim_opt); + + IntArrayRef x_sizes = x.sizes(); + DimVector shift(dim.size()); + for (int64_t i = 0; i < dim.size(); ++i) { + shift[i] = (x_sizes[dim[i]] + 1) / 2; + } + + return at::roll(x, shift, dim); +} + // This is a pass-through wrapper function that does the size check and // inferences. The actual forward implementation function is called @@ -393,6 +625,10 @@ void _cufft_clear_plan_cache(int64_t device_index) { } Tensor fft(const Tensor& self, const int64_t signal_ndim, const bool normalized) { + TORCH_WARN_ONCE( + "The function torch.fft is deprecated and will be removed in PyTorch 1.8. " + "Use the new torch.fft module functions, instead, by importing torch.fft " + "and calling torch.fft.fft or torch.fft.fftn."); return _fft(self, signal_ndim, /* complex_input */ true, /* complex_output */ true, /* inverse */ false, {}, normalized ? fft_norm_mode::by_root_n : fft_norm_mode::none, @@ -400,6 +636,10 @@ Tensor fft(const Tensor& self, const int64_t signal_ndim, const bool normalized) } Tensor ifft(const Tensor& self, const int64_t signal_ndim, const bool normalized) { + TORCH_WARN_ONCE( + "The function torch.ifft is deprecated and will be removed in a future " + "PyTorch release. Use the new torch.fft module functions, instead, by " + "importing torch.fft and calling torch.fft.ifft or torch.fft.ifftn."); return _fft(self, signal_ndim, /* complex_input */ true, /* complex_output */ true, /* inverse */ true, {}, normalized ? 
fft_norm_mode::by_root_n : fft_norm_mode::by_n, @@ -408,6 +648,10 @@ Tensor ifft(const Tensor& self, const int64_t signal_ndim, const bool normalized Tensor rfft(const Tensor& self, const int64_t signal_ndim, const bool normalized, const bool onesided) { + TORCH_WARN_ONCE( + "The function torch.rfft is deprecated and will be removed in a future " + "PyTorch release. Use the new torch.fft module functions, instead, by " + "importing torch.fft and calling torch.fft.fft or torch.fft.rfft."); return _fft(self, signal_ndim, /* complex_input */ false, /* complex_output */ true, /* inverse */ false, {}, normalized ? fft_norm_mode::by_root_n : fft_norm_mode::none, @@ -416,6 +660,10 @@ Tensor rfft(const Tensor& self, const int64_t signal_ndim, const bool normalized Tensor irfft(const Tensor& self, const int64_t signal_ndim, const bool normalized, const bool onesided, IntArrayRef signal_sizes) { + TORCH_WARN_ONCE( + "The function torch.irfft is deprecated and will be removed in a future " + "PyTorch release. Use the new torch.fft module functions, instead, by " + "importing torch.fft and calling torch.fft.ifft or torch.fft.irfft."); return _fft(self, signal_ndim, /* complex_input */ true, /* complex_output */ false, /* inverse */ true, signal_sizes, normalized ? fft_norm_mode::by_root_n : fft_norm_mode::by_n, @@ -463,8 +711,10 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const optional hop const bool return_complex = return_complexOpt.value_or( self.is_complex() || (window.defined() && window.is_complex())); if (!return_complexOpt && !return_complex) { - TORCH_WARN("stft will return complex tensors by default in future, use" - " return_complex=False to preserve the current output format."); + TORCH_WARN_ONCE("stft will require the return_complex parameter be explicitly " + " specified in a future PyTorch release. Use return_complex=False " + " to preserve the current behavior or return_complex=True to return " + " a complex output."); } if (!at::isFloatingType(self.scalar_type()) && !at::isComplexType(self.scalar_type())) { diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index ad6625308ff5..bc58ba8e6eec 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -135,6 +135,26 @@ static Tensor reshape_indexer(const Tensor& index, int64_t dims_before, int64_t return index.reshape(shape); } +static ptrdiff_t dataOffset(const Tensor& tensor, ptrdiff_t linearIndex) { + auto size = tensor.sizes(); + auto stride = tensor.strides(); + int nDim = tensor.dim(); + ptrdiff_t dataOffset = 0; + for (int i = nDim - 1; i >= 0; i--) { + dataOffset += (linearIndex % size[i]) * stride[i]; + linearIndex /= size[i]; + } + return dataOffset; +} + +static inline int64_t wrapLinearIndex(int64_t linearIndex, int64_t numel) { + return linearIndex < 0 ? 
linearIndex + numel : linearIndex; +} + +static inline void checkLinearIndex(int64_t linearIndex, int64_t numel) { + TORCH_CHECK(linearIndex < numel && linearIndex >= -numel, "out of range: ", linearIndex, " out of ", numel); +} + AdvancedIndex::AdvancedIndex(const Tensor& src, TensorList indices_list) { int64_t element_size_bytes = src.element_size(); @@ -815,6 +835,77 @@ Tensor masked_select_backward(const Tensor& grad, const Tensor& input, const Ten return result.masked_scatter_(mask, grad); } +void take_out_cpu_template( + Tensor& output, + Tensor const& input, + Tensor const& index) +{ + TORCH_CHECK(output.device().type() == at::kCPU, "device type of output (", output.device().type(), ") is not CPU"); + TORCH_CHECK(input.device().type() == at::kCPU, "device type of input (", input.device().type(), ") is not CPU"); + TORCH_CHECK(index.device().type() == at::kCPU, "device type of index (", index.device().type(), ") is not CPU"); + + TORCH_CHECK(output.layout() == Layout::Strided, "take() only supports strided layout, got layout: ", + output.layout(), " on output tensor"); + TORCH_CHECK(input.layout() == Layout::Strided, "take() only supports strided layout, got layout: ", + input.layout(), " on input tensor"); + TORCH_CHECK(index.layout() == Layout::Strided, "take() only supports strided layout, got layout: ", + index.layout(), " on index tensor"); + + TORCH_CHECK(output.scalar_type() == input.scalar_type(), "output and input scalar type must match.", + "But got different types: ", output.scalar_type(), " and ", input.scalar_type()); + TORCH_CHECK(index.scalar_type() == kLong, "index must be an int64 tensor"); + + output.resize_(index.sizes()); + auto output_contiguous = output.contiguous(); + auto index_continuous = index.contiguous(); + bool is_contiguous = input.is_contiguous(); + auto input_size = input.numel(); + + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Bool, at::ScalarType::Half, input.scalar_type(), "take_cpu", [&] { + auto output_data = output_contiguous.data_ptr(); + auto input_data = input.data_ptr(); + auto index_data = index.data_ptr(); + + // Exceptions must not be thrown across parallel sections, so we + // record the position of the invalid index and throw the exception after the + // loop. 
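For orientation, the user-visible semantics this CPU path implements (linear indexing into the flattened input, with negative indices wrapped by `wrapLinearIndex` and out-of-range values rejected by `checkLinearIndex` only after the parallel loop has finished) look roughly like this at the Python level. A small illustrative example, not part of the patch:

```python
import torch

x = torch.arange(12.).reshape(3, 4)
idx = torch.tensor([0, 5, -1])          # linear indices into the flattened input; -1 wraps to 11
print(torch.take(x, idx))               # expected: tensor([ 0.,  5., 11.])

try:
    torch.take(x, torch.tensor([12]))   # out of range for a 12-element input
except (IndexError, RuntimeError) as err:
    print("rejected:", err)
```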
+ std::atomic invalidIdxPos(-1); + + at::parallel_for(0, index.numel(), at::internal::GRAIN_SIZE, + [&](int64_t start, int64_t end) { + for (auto i = start; i < end; i++) { + int64_t idx = index_data[i]; + if (idx < input_size && idx >= -input_size) { + idx = wrapLinearIndex(idx, input_size); + if (is_contiguous) { + output_data[i] = input_data[idx]; + } else { + output_data[i] = input_data[dataOffset(input, idx)]; + } + } else { + int64_t tmp = -1; + invalidIdxPos.compare_exchange_strong(tmp, i); + } + } + }); + + if (invalidIdxPos >= 0) { + checkLinearIndex(index_data[invalidIdxPos], input_size); + } + }); +} + +Tensor take_cpu(const Tensor& self, const Tensor& index) { + auto output = at::empty(index.sizes(), self.options()); + take_out_cpu_template(output, self, index); + return output; +} + +Tensor& take_out_cpu(Tensor& out, const Tensor& self, const Tensor& index) { + take_out_cpu_template(out, self, index); + return out; +} + Tensor take_backward(const Tensor& grad, const Tensor& input, const Tensor& index) { return at::zeros_like(input).put_(index, grad, true); } diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index 2764490f6d48..e2b5639f8dc9 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -162,7 +162,10 @@ Tensor& abs_out(Tensor& result, const Tensor& self) { Tensor abs(const Tensor& self) { return unary_op_impl_with_complex_to_float(self, at::abs_out); } -Tensor& abs_(Tensor& self) { return unary_op_impl_(self, at::abs_out); } +Tensor& abs_(Tensor& self) { + TORCH_CHECK(!self.is_complex(), "In-place abs is not supported for complex tensors."); + return unary_op_impl_(self, at::abs_out); +} // Absolute, alias for abs Tensor& absolute_out(Tensor& result, const Tensor& self) { @@ -301,6 +304,17 @@ Tensor& sign_out(Tensor& result, const Tensor& self) { return unary_op_impl_out( Tensor sign(const Tensor& self) { return unary_op_impl(self, at::sign_out); } Tensor& sign_(Tensor& self) { return unary_op_impl_(self, at::sign_out); } +Tensor& sgn_out(Tensor& result, const Tensor& self) { + if (self.is_complex()) { + return unary_op_impl_out(result, self, sgn_stub); + } else { + return unary_op_impl_out(result, self, sign_stub); + } +} + +Tensor sgn(const Tensor& self) { return unary_op_impl(self, at::sgn_out); } +Tensor& sgn_(Tensor& self) { return unary_op_impl_(self, at::sgn_out); } + Tensor& sin_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, sin_stub); } Tensor sin(const Tensor& self) { return unary_op_impl(self, at::sin_out); } Tensor& sin_(Tensor& self) { return unary_op_impl_(self, at::sin_out); } @@ -373,6 +387,41 @@ Tensor& logit_(Tensor& self, c10::optional eps) { return at::logit_out(self, self, eps); } +Tensor& nan_to_num_out( + Tensor& result, + const Tensor& self, + c10::optional nan, + c10::optional pos_inf, + c10::optional neg_inf) { + + if (c10::isIntegralType(self.scalar_type())) { + result.resize_as_(self); + result.copy_(self); + return result; + } + + auto iter = TensorIterator::unary_op(result, self); + nan_to_num_stub(iter.device_type(), iter, nan, pos_inf, neg_inf); + return result; +} + +Tensor nan_to_num( + const Tensor& self, + c10::optional nan, + c10::optional pos_inf, + c10::optional neg_inf) { + auto result = at::empty_like(self); + return at::nan_to_num_out(result, self, nan, pos_inf, neg_inf); +} + +Tensor& nan_to_num_( + Tensor& self, + c10::optional nan, + c10::optional pos_inf, + c10::optional neg_inf) { + return at::nan_to_num_out(self, self, 
nan, pos_inf, neg_inf); +} + Tensor& tanh_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, tanh_stub); } Tensor tanh(const Tensor& self) { return unary_op_impl(self, at::tanh_out); } Tensor& tanh_(Tensor& self) { return unary_op_impl_(self, at::tanh_out); } @@ -405,9 +454,9 @@ Tensor& neg_out(Tensor& result, const Tensor& self) { Tensor neg(const Tensor& self) { return unary_op_impl(self, at::neg_out); } Tensor& neg_(Tensor& self) { return unary_op_impl_(self, at::neg_out); } -Tensor& negative_out(Tensor& result, const Tensor& self) { return at::native::neg_out(result, self); } -Tensor negative(const Tensor& self) { return at::native::neg(self); } -Tensor& negative_(Tensor& self) { return at::native::neg_(self); } +Tensor& negative_out(Tensor& result, const Tensor& self) { return at::neg_out(result, self); } +Tensor negative(const Tensor& self) { return self.neg(); } +Tensor& negative_(Tensor& self) { return self.neg_(); } Tensor logical_not(const Tensor& self) { Tensor result = at::empty({0}, self.options().dtype(kBool)); @@ -631,6 +680,7 @@ DEFINE_DISPATCH(log1p_stub); DEFINE_DISPATCH(log2_stub); DEFINE_DISPATCH(logical_not_stub); DEFINE_DISPATCH(neg_stub); +DEFINE_DISPATCH(nan_to_num_stub); DEFINE_DISPATCH(polygamma_stub); DEFINE_DISPATCH(reciprocal_stub); DEFINE_DISPATCH(round_stub); @@ -639,6 +689,7 @@ DEFINE_DISPATCH(sigmoid_stub); DEFINE_DISPATCH(logit_stub); DEFINE_DISPATCH(sign_stub); DEFINE_DISPATCH(signbit_stub); +DEFINE_DISPATCH(sgn_stub); DEFINE_DISPATCH(sin_stub); DEFINE_DISPATCH(sinh_stub); DEFINE_DISPATCH(sqrt_stub); diff --git a/aten/src/ATen/native/UnaryOps.h b/aten/src/ATen/native/UnaryOps.h index fa172cb58b38..a6db47f17153 100644 --- a/aten/src/ATen/native/UnaryOps.h +++ b/aten/src/ATen/native/UnaryOps.h @@ -53,6 +53,7 @@ DECLARE_DISPATCH(unary_fn, sigmoid_stub); DECLARE_DISPATCH(unary_fn_with_scalar, logit_stub); DECLARE_DISPATCH(unary_fn, sign_stub); DECLARE_DISPATCH(unary_fn, signbit_stub); +DECLARE_DISPATCH(unary_fn, sgn_stub); DECLARE_DISPATCH(unary_fn, sin_stub); DECLARE_DISPATCH(unary_fn, sinh_stub); DECLARE_DISPATCH(unary_fn, sqrt_stub); @@ -76,6 +77,13 @@ DECLARE_DISPATCH(void(*)(TensorIterator&, c10::optional), random_stub DECLARE_DISPATCH(void(*)(TensorIterator&, const int64_t), polygamma_stub); DECLARE_DISPATCH(void(*)(TensorIterator&, Scalar a, Scalar b), clamp_stub); DECLARE_DISPATCH(void(*)(Tensor&, const Tensor&, int64_t, bool, c10::optional), multinomial_stub); +DECLARE_DISPATCH( + void (*)( + TensorIterator&, + c10::optional, + c10::optional, + c10::optional), + nan_to_num_stub); // Missing unary functions // digamma diff --git a/aten/src/ATen/native/VariableMethodStubs.cpp b/aten/src/ATen/native/VariableMethodStubs.cpp index d06c27f69e3d..7d5cea725cf1 100644 --- a/aten/src/ATen/native/VariableMethodStubs.cpp +++ b/aten/src/ATen/native/VariableMethodStubs.cpp @@ -12,7 +12,7 @@ void backward(const Tensor& self, const Tensor& gradient, c10::optional ke AT_ERROR("backward is not implemented for Tensor"); } -void set_data(const Tensor& self, const Tensor& new_data) { +void set_data(Tensor& self, const Tensor& new_data) { AT_ERROR("set_data is not implemented for Tensor"); } @@ -36,7 +36,7 @@ Tensor& requires_grad_(Tensor& self, bool _requires_grad) { AT_ERROR("requires_grad_ is not implemented for Tensor"); } -void retain_grad(const Tensor& self) { +void retain_grad(Tensor& self) { AT_ERROR("retain_grad is not implemented for Tensor"); } diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp 
b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp index 09847a010ee3..fce8c348919b 100644 --- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp @@ -237,14 +237,14 @@ void logical_and_kernel(TensorIterator& iter) { // We use if-else here specifically for bool instead of using iter.common_dtype() like the CUDA implementation because // common_dtype() is unavailable for bfloat16. if (iter.dtype() == ScalarType::Bool) { - AT_DISPATCH_ALL_TYPES_AND3(kBool, kBFloat16, kHalf, iter.input_dtype(), "logical_and_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, iter.input_dtype(), "logical_and_cpu", [&]() { cpu_kernel(iter, [](scalar_t a, scalar_t b) -> bool { return a && b; }); }); } else { - AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, iter.dtype(), "logical_and_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.dtype(), "logical_and_cpu", [&]() { cpu_kernel(iter, [](scalar_t a, scalar_t b) -> scalar_t { return static_cast(a && b); @@ -257,14 +257,14 @@ void logical_or_kernel(TensorIterator& iter) { // We use if-else here specifically for bool instead of using iter.common_dtype() like the CUDA implementation because // common_dtype() is unavailable for bfloat16. if (iter.dtype() == ScalarType::Bool) { - AT_DISPATCH_ALL_TYPES_AND3(kBool, kBFloat16, kHalf, iter.input_dtype(), "logical_or_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, iter.input_dtype(), "logical_or_cpu", [&]() { cpu_kernel(iter, [](scalar_t a, scalar_t b) -> bool { return a || b; }); }); } else { - AT_DISPATCH_ALL_TYPES_AND3(kBool, kBFloat16, kHalf, iter.dtype(), "logical_or_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, iter.dtype(), "logical_or_cpu", [&]() { cpu_kernel(iter, [](scalar_t a, scalar_t b) -> scalar_t { return static_cast(a || b); @@ -277,14 +277,14 @@ void logical_xor_kernel(TensorIterator& iter) { // We use if-else here specifically for bool instead of using iter.common_dtype() like the CUDA implementation because // common_dtype() is unavailable for bfloat16. if (iter.dtype() == ScalarType::Bool) { - AT_DISPATCH_ALL_TYPES_AND3(kBool, kBFloat16, kHalf, iter.input_dtype(), "logical_xor_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, iter.input_dtype(), "logical_xor_cpu", [&]() { cpu_kernel(iter, [](scalar_t a, scalar_t b) -> bool { return bool(a) != bool(b); }); }); } else { - AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, iter.dtype(), "logical_xor_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.dtype(), "logical_xor_cpu", [&]() { cpu_kernel(iter, [](scalar_t a, scalar_t b) -> scalar_t { return static_cast(bool(a) != bool(b)); @@ -502,24 +502,25 @@ void minimum_kernel(TensorIterator& iter) { } } -void smooth_l1_kernel(TensorIterator& iter) { +void smooth_l1_kernel(TensorIterator& iter, double beta) { AT_DISPATCH_FLOATING_TYPES_AND2( kBFloat16, kHalf, iter.dtype(), "smooth_l1_cpu", [&]() { using Vec = Vec256; - const Vec one_vec(static_cast(1)); + const scalar_t beta_val(beta); + const Vec beta_val_vec(beta_val); const Vec point_five_vec(static_cast(0.5)); cpu_kernel_vec( iter, - [](scalar_t a, scalar_t b) -> scalar_t { + [&beta_val](scalar_t a, scalar_t b) -> scalar_t { auto z = std::abs(a - b); - return z < static_cast(1) - ? static_cast(0.5) * z * z - : z - static_cast(0.5); + return z < beta_val + ? 
static_cast(0.5) * z * z / beta_val + : z - static_cast(0.5) * beta_val; }, - [&one_vec, &point_five_vec](Vec a, Vec b) { + [&beta_val_vec, &point_five_vec](Vec a, Vec b) { auto z = (a - b).abs(); return Vec::blendv( - point_five_vec * z * z, z - point_five_vec, z >= one_vec); + point_five_vec * z * z / beta_val_vec, z - point_five_vec * beta_val_vec, z >= beta_val_vec); }); }); } diff --git a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp index 114ca93dae26..34911a2975e4 100644 --- a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp @@ -104,7 +104,11 @@ struct Dist { // Special general pnorm derivative if p is less than two struct lttdist_calc { - static inline Vec backward(const Vec& diff, const scalar_t grad, const scalar_t dist, const Vec& p) { return dist == 0.0 ? Vec(0) : sign(diff) * diff.abs().pow(p - Vec(1)) * Vec(grad) / Vec(dist).pow(p - Vec(1)); } + static inline Vec backward(const Vec& diff, const scalar_t grad, const scalar_t dist, const Vec& p) { + Vec result = (dist == 0.0) ? Vec(0) : (sign(diff) * diff.abs().pow(p - Vec(1)) * Vec(grad) / Vec(dist).pow(p - Vec(1))); + result = Vec::blendv(result, Vec(0), (diff == Vec(0)) & (p < Vec(1))); + return result; + } }; // Two norm diff --git a/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp b/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp index 45c803e0fec2..4a52178972fc 100644 --- a/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp @@ -46,28 +46,39 @@ static void addcdiv_cpu_kernel(TensorIterator& iter, Scalar value) { }); } -static void smooth_l1_backward_cpu_kernel(TensorIterator& iter, Scalar norm) { +static void smooth_l1_backward_cpu_kernel(TensorIterator& iter, Scalar norm, double beta) { ScalarType dtype = iter.dtype(0); AT_DISPATCH_ALL_TYPES(dtype, "smooth_l1_backward_cpu_out", [&] { auto norm_val = norm.to(); + scalar_t beta_val(beta); auto norm_val_vec = Vec256(norm_val); + auto beta_val_vec = Vec256(beta_val); const auto neg_1_vec = Vec256(-1); + const auto zero_vec = Vec256(0); const auto pos_1_vec = Vec256(1); cpu_kernel_vec(iter, [=](scalar_t input, scalar_t target, scalar_t grad_output) -> scalar_t { const auto x = input - target; - if (x < -1.) + if (x <= -beta) return -norm_val * grad_output; - else if (x > 1.) 
+ else if (x >= beta) return norm_val * grad_output; else - return norm_val * x * grad_output; + return norm_val * x * grad_output / beta; }, - [norm_val_vec, neg_1_vec, pos_1_vec]( + [norm_val_vec, beta_val_vec, neg_1_vec, zero_vec, pos_1_vec]( Vec256 input, Vec256 target, Vec256 grad_output) -> Vec256 { - auto x = input - target; - x = clamp(x, neg_1_vec, pos_1_vec); - return norm_val_vec * x * grad_output; + // using two blendv calls to simulate the 3 cases + // 1 if x >= beta + // -1 if x <= -beta + // x / beta if |x| < beta + const auto x = input - target; + const auto pos_or_neg_1_vec = Vec256::blendv( + neg_1_vec, pos_1_vec, x > zero_vec); + const auto x_abs = x.abs(); + const auto output = Vec256::blendv( + x / beta_val_vec, pos_or_neg_1_vec, x_abs >= beta_val_vec); + return norm_val_vec * output * grad_output; } ); }); diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index adf300522692..84c3ceed3a23 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -213,11 +213,14 @@ static void bitwise_not_kernel(TensorIterator& iter) { }); } else { AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "bitwise_not_cpu", [&]() { - cpu_kernel( + cpu_kernel_vec( iter, [](scalar_t a) -> scalar_t { return ~a; - }); + }, + [](Vec256 a) -> Vec256 { + return ~a; + }); }); } } @@ -235,9 +238,9 @@ static void logical_not_kernel(TensorIterator& iter) { // NOTE: this implementation differs from the CUDA implementation which only does single dispatch // (to avoid expensive compilation) because CPU kernels don't handle dynamic_casting // (see needs_dynamic_casting). - AT_DISPATCH_ALL_TYPES_AND2(kBool, kHalf, iter.dtype(1), "logical_not_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, iter.dtype(1), "logical_not_cpu", [&]() { using self_t = scalar_t; - AT_DISPATCH_ALL_TYPES_AND2(kBool, kHalf, iter.dtype(0), "logical_not_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, iter.dtype(0), "logical_not_cpu", [&]() { cpu_kernel(iter, [](self_t a) -> scalar_t { return static_cast(!a); }); }); }); @@ -270,16 +273,16 @@ static void sign_kernel(TensorIterator& iter){ auto one_vec = Vec256(static_cast(1)); cpu_kernel_vec( - iter, - [=](scalar_t a) -> scalar_t { return (0 < a) - (a < 0); }, - [=](Vec256 self_vec){ + iter, + [=](scalar_t a) -> scalar_t { return (0 < a) - (a < 0); }, + [=](Vec256 self_vec){ - // Comparision operators returns bitmask. - auto left = Vec256::blendv(zero_vec, one_vec, zero_vec < self_vec); - auto right = Vec256::blendv(zero_vec, one_vec, self_vec < zero_vec); + // Comparision operators returns bitmask. 
+ auto left = Vec256::blendv(zero_vec, one_vec, zero_vec < self_vec); + auto right = Vec256::blendv(zero_vec, one_vec, self_vec < zero_vec); - return left - right; - }); + return left - right; + }); }); } } @@ -290,6 +293,15 @@ static void signbit_kernel(TensorIterator& iter){ }); } +static void sgn_kernel(TensorIterator& iter){ + AT_DISPATCH_COMPLEX_TYPES(iter.dtype(), 'sgn_cpu', [&]() { + cpu_kernel_vec( + iter, + [=](scalar_t a) -> scalar_t { return sgn_impl(a); }, + [=](Vec256 a) { return a.sgn(); }); + }); +} + static void sinh_kernel(TensorIterator& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.dtype(), "sinh_cpu", [&]() { cpu_kernel_vec( @@ -371,6 +383,33 @@ static void polygamma_kernel(TensorIterator& iter, int64_t n) { } } +static void nan_to_num_kernel( + TensorIterator& iter, + c10::optional nan, + c10::optional pos_inf, + c10::optional neg_inf) { + AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.dtype(), "nan_to_num", [&]() { + scalar_t nan_replacement = static_cast(nan.value_or(0.)); + scalar_t pos_inf_replacement = pos_inf.has_value() + ? static_cast(pos_inf.value()) + : std::numeric_limits::max(); + scalar_t neg_inf_replacement = neg_inf.has_value() + ? static_cast(neg_inf.value()) + : std::numeric_limits::lowest(); + + cpu_kernel(iter, [=](scalar_t a) -> scalar_t { + return ( + at::_isnan(a) + ? nan_replacement + : (a == std::numeric_limits::infinity() + ? pos_inf_replacement + : (a == -std::numeric_limits::infinity() + ? neg_inf_replacement + : a))); + }); + }); +} + static void clamp_kernel(TensorIterator& iter, Scalar min_scalar, Scalar max_scalar) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, iter.dtype(), "clamp_cpu", [&]() { c10::scalar_value_type::type (*zabs_)(scalar_t) = zabs; @@ -636,9 +675,11 @@ REGISTER_DISPATCH(bitwise_not_stub, &bitwise_not_kernel); REGISTER_DISPATCH(logical_not_stub, &logical_not_kernel); REGISTER_DISPATCH(frac_stub, &frac_kernel); REGISTER_DISPATCH(reciprocal_stub, &reciprocal_kernel); +REGISTER_DISPATCH(nan_to_num_stub, &nan_to_num_kernel); REGISTER_DISPATCH(neg_stub, &neg_kernel); REGISTER_DISPATCH(sign_stub, &sign_kernel); REGISTER_DISPATCH(signbit_stub, &signbit_kernel); +REGISTER_DISPATCH(sgn_stub, &sgn_kernel); REGISTER_DISPATCH(sinh_stub, &sinh_kernel); REGISTER_DISPATCH(cosh_stub, &cosh_kernel); REGISTER_DISPATCH(acosh_stub, &acosh_kernel); @@ -669,7 +710,7 @@ IMPLEMENT_COMPLEX_KERNEL(FLOATING, log10) IMPLEMENT_FLOAT_KERNEL(FLOATING, log1p) IMPLEMENT_COMPLEX_KERNEL(FLOATING, log2) IMPLEMENT_FLOAT_KERNEL(FLOATING, i0) -IMPLEMENT_COMPLEX_KERNEL(FLOATING, round) +IMPLEMENT_FLOAT_KERNEL(FLOATING, round) IMPLEMENT_COMPLEX_KERNEL(FLOATING, sin) IMPLEMENT_COMPLEX_KERNEL(FLOATING, sqrt) IMPLEMENT_COMPLEX_KERNEL(FLOATING, tan) diff --git a/aten/src/ATen/native/cpu/zmath.h b/aten/src/ATen/native/cpu/zmath.h index d6816f4dd182..e0554e0cbc29 100644 --- a/aten/src/ATen/native/cpu/zmath.h +++ b/aten/src/ATen/native/cpu/zmath.h @@ -138,6 +138,15 @@ inline c10::complex ceil_impl (c10::complex z) { return c10::complex(std::ceil(z.real()), std::ceil(z.imag())); } +template +inline c10::complex sgn_impl (c10::complex z) { + if (z == c10::complex(0, 0)) { + return c10::complex(0, 0); + } else { + return z / zabs(z); + } +} + template inline TYPE floor_impl (TYPE z) { return std::floor(z); diff --git a/aten/src/ATen/native/cuda/AbsKernel.cu b/aten/src/ATen/native/cuda/AbsKernel.cu index 4113115d7b12..649b235bf654 100644 --- a/aten/src/ATen/native/cuda/AbsKernel.cu +++ b/aten/src/ATen/native/cuda/AbsKernel.cu @@ -6,11 +6,16 @@ namespace at { 
namespace native { +template +struct AbsFunctor { + __device__ __forceinline__ scalar_t operator() (const scalar_t a) const { + return std::abs(a); + } +}; + void abs_kernel_cuda(TensorIterator& iter) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool, iter.dtype(), "abs_cuda", [&]() { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { - return std::abs(a); - }); + gpu_kernel(iter, AbsFunctor()); }); } diff --git a/aten/src/ATen/native/cuda/AmpKernels.cu b/aten/src/ATen/native/cuda/AmpKernels.cu index 0d8b87f402de..7f5966739c21 100644 --- a/aten/src/ATen/native/cuda/AmpKernels.cu +++ b/aten/src/ATen/native/cuda/AmpKernels.cu @@ -3,9 +3,13 @@ #include #include +#include #include -#include +#include #include +#include +#include + namespace { // Thin wrapper around https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__SINGLE.html#group__CUDA__MATH__SINGLE_1g57a3c8313f570282a1a7bcc78743b08e, @@ -33,49 +37,136 @@ static __host__ __device__ __forceinline__ int isfinite_ensure_cuda_math(float v namespace at { namespace native { -// Multiplies scaled_grad in-place by inv_scale. If an element of scaled_grad was inf or NaN sets found_inf to 1.0. -// -// Args: -// scaled_grad: A (scaled) gradient tensor. May contain infs or NaNs. -// found_inf: A single-element float tensor to which 1.0 will be written if any gradients contain infs/nans. -// Pre-zeroing found_inf, if appropriate, is the responsibility of the caller. -// inv_scale: The inverse of the scale factor by which scaled_grad is currently multiplied. -// -// Returns: -// A tuple with references to scaled_grad, which is now unscaled in place, and found_inf, -// which is now guaranteed to contain 1.0 if an inf or NaN was found in scaled_grad. +namespace { +// Single-tensor fallback for _amp_foreach_non_finite_check_and_unscale_cuda_. +// Handles individual tensors that are acceptable to unscale but not MTA-safe. void _amp_non_finite_check_and_unscale_cuda_(Tensor& scaled_grad, Tensor& found_inf, const Tensor& inv_scale) { - TORCH_CHECK(scaled_grad.is_cuda(), "scaled_grad must be a CUDA tensor."); + // The only way we reach this function is through _amp_foreach_non_finite_check_and_unscale_cuda_, so no input checks. + + // It's not obvious gpu_kernel always guards onto its argument. Guarding here just in case. + const OptionalDeviceGuard device_guard(device_of(scaled_grad)); + + // Acts on scaled_grad in place. + auto iter = TensorIterator::unary_op(scaled_grad, scaled_grad); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + iter.dtype(), + "_amp_non_finite_check_and_unscale_cuda", + [&iter, &found_inf, &inv_scale] { + auto* found_inf_ptr = found_inf.data_ptr(); + auto* inv_scale_ptr = inv_scale.data_ptr(); + + using opmath_t = get_opmath_t::opmath_t; + + gpu_kernel(iter, + [found_inf_ptr, inv_scale_ptr] GPU_LAMBDA (scalar_t val_in) -> scalar_t { + auto val = static_cast(val_in); + if (!isfinite_ensure_cuda_math(val)) { + *found_inf_ptr = 1.f; + } + // Every thread accesses inv_scale, but it will hit in cache. + const auto inv_scale_val = *inv_scale_ptr; + return static_cast(inv_scale_val == 1.f ? val : val * inv_scale_val); + }); + }); +} +} // anonymous namespace + + +// Multiplies each tensor in scaled_grads by inv_scale in-place. +// If any element of any tensor in scaled_grads is inf or NaN, sets found_inf to 1.0. +// Uses multi tensor apply (MTA) to process all MTA-safe tensors. +// +// Args: +// scaled_grads: A TensorList of scaled gradient tensors. May contain infs or NaNs. 
+// found_inf: A single-element float tensor to which 1.0 will be written if any gradient contain infs/nans. +// Pre-zeroing found_inf, if appropriate, is the responsibility of the caller. +// inv_scale: The inverse of the scale factor by which scaled_grads are currently multiplied. +void _amp_foreach_non_finite_check_and_unscale_cuda_(TensorList scaled_grads, + Tensor& found_inf, + const Tensor& inv_scale) +{ + if (scaled_grads.size() == 0) { + return; + } + TORCH_CHECK(inv_scale.is_cuda(), "inv_scale must be a CUDA tensor."); TORCH_CHECK(found_inf.is_cuda(), "found_inf must be a CUDA tensor."); TORCH_CHECK(inv_scale.numel() == 1, "inv_scale must be a 1-element tensor."); TORCH_CHECK(found_inf.numel() == 1, "found_inf must be a 1-element tensor."); TORCH_CHECK(inv_scale.scalar_type() == at::ScalarType::Float, "inv_scale must be a float tensor."); TORCH_CHECK(found_inf.scalar_type() == at::ScalarType::Float, "found_inf must be a float tensor."); - TORCH_CHECK(scaled_grad.layout() == at::kStrided, "scaled_grad must be a strided (not sparse) Tensor."); - // Act on scaled_grad in place. - auto iter = TensorIterator::unary_op(scaled_grad, scaled_grad); + // Ensures client code (GradScaler) filtered scaled_grads by dtype. + check_foreach_api_restrictions(scaled_grads); + + std::vector> tensor_lists; + + // is_non_overlapping_and_dense() is not available in Python. + // GradScaler can't filter for it. We need to filter here. + if (can_use_fast_route(scaled_grads)) { + // Hopefully common case. + // can_use_fast_route is true, which confirms: + // - all scaled_grads are strided + // - all scaled_grads are non overlapping and dense + // - all scaled_grads are on the same device + TORCH_CHECK(scaled_grads[0].is_cuda(), "scaled_grads must be CUDA tensors."); + // Sets up MTA launch to use scaled_grads as-is. + tensor_lists.emplace_back(scaled_grads.vec()); + } else { + // Hopefully uncommon case. + // can_use_fast_route is an all-or-nothing check. In this path it was false, + // so any of the above confirmations could have gone wrong. + // We filter MTA-safe tensors into an MTA-able list. + // If a tensor is acceptable but not MTA-safe, we fall back to the TensorIterator kernel. + // If a tensor is unacceptable, we throw an error to blame GradScaler. + tensor_lists.resize(1); + tensor_lists[0].reserve(scaled_grads.size()); + auto expected_device = scaled_grads[0].device(); + for (const Tensor& t : scaled_grads) { + // Ensures GradScaler filtered scaled_grads by device. + TORCH_CHECK(t.is_cuda(), "one of scaled_grads was not a CUDA tensor."); + TORCH_CHECK(t.device() == expected_device, "scaled_grads must be on the same device."); + TORCH_CHECK(t.layout() == at::kStrided, "one of scaled_grads was not a strided tensor."); + if (!t.is_non_overlapping_and_dense()) { + // t is acceptable but not MTA-safe. Falls back to single-tensor TensorIterator kernel. 
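// Illustrative aside (not part of this patch): whichever path a tensor takes,
// each element is processed by the same check-and-scale rule. A host-side
// scalar sketch of that rule, with illustrative names only:
//
//   float unscale_one(float val, float inv_scale, float* found_inf_ptr) {
//     if (!std::isfinite(val)) *found_inf_ptr = 1.f;     // flag, never throw
//     return inv_scale == 1.f ? val : val * inv_scale;   // skip the no-op multiply
//   }
//
// The single-tensor fallback invoked just below and the multi_tensor_apply
// dispatch further down both apply this rule per element; on the device the
// finiteness check goes through the isfinite_ensure_cuda_math wrapper defined
// earlier in this file.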
+ _amp_non_finite_check_and_unscale_cuda_(const_cast(t), + found_inf, + inv_scale); + } else { + tensor_lists[0].push_back(t); + } + } + if (tensor_lists[0].size() == 0) { + return; + } + } AT_DISPATCH_FLOATING_TYPES_AND_HALF( - iter.dtype(), - "_amp_non_finite_check_and_unscale_cuda", - [&iter, &found_inf, &inv_scale] { + tensor_lists[0][0].scalar_type(), + "_amp_foreach_non_finite_check_and_unscale_cuda", + [&tensor_lists, &found_inf, &inv_scale] { auto* found_inf_ptr = found_inf.data_ptr(); auto* inv_scale_ptr = inv_scale.data_ptr(); - gpu_kernel(iter, [found_inf_ptr, inv_scale_ptr]GPU_LAMBDA(scalar_t val) -> scalar_t { - float fval = static_cast(val); - // See isfinite_ensure_cuda_math above. - if (!isfinite_ensure_cuda_math(fval)) { - *found_inf_ptr = 1.f; - } - const auto inv_scale_val = *inv_scale_ptr; // Every thread accesses inv_scale, but it will hit in cache. - return static_cast(inv_scale_val == 1.f ? fval : fval*inv_scale_val); - }); + using opmath_t = get_opmath_t::opmath_t; + + // multi_tensor_apply guards onto tensor_lists[0][0], no need to guard explicitly. + multi_tensor_apply<1>(tensor_lists, + UnaryOpFunctor_(), + [found_inf_ptr, inv_scale_ptr] GPU_LAMBDA (opmath_t val) -> opmath_t { + // There is a slight asymmetry here with the TensorIterator kernel above. + // MTA Functors ensure val comes in as opmath_t rather than scalar_t. + if (!isfinite_ensure_cuda_math(val)) { + *found_inf_ptr = 1.f; + } + // Every thread accesses inv_scale, but it will hit in cache. + const auto inv_scale_val = *inv_scale_ptr; + return static_cast(inv_scale_val == 1.f ? val : val * inv_scale_val); + }); }); } diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu index 5394c2a23239..e9dfe2d9285d 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu @@ -9,6 +9,7 @@ #include #include #include +#include #include // for USE_MAGMA @@ -116,17 +117,18 @@ void magmaOrgqr( magma_int_t m, magma_int_t n, magma_int_t k, scalar_t* dA, magma_int_t ldda, scalar_t* tau, scalar_t* dT, magma_int_t nb, magma_int_t* info); -template +template void magmaSymeig( magma_vec_t jobz, magma_uplo_t uplo, magma_int_t n, scalar_t* dA, magma_int_t ldda, - scalar_t* w, scalar_t* wA, magma_int_t ldwa, scalar_t* work, magma_int_t lwork, - magma_int_t* iwork, magma_int_t liwork, magma_int_t* info); + value_t* w, scalar_t* wA, magma_int_t ldwa, scalar_t* work, magma_int_t lwork, value_t* rwork, + magma_int_t lrwork, magma_int_t* iwork, magma_int_t liwork, magma_int_t* info); -template +template void magmaSvd( magma_vec_t jobz, magma_int_t m, magma_int_t n, scalar_t* A, - magma_int_t lda, scalar_t* s, scalar_t* U, magma_int_t ldu, + magma_int_t lda, value_t* s, scalar_t* U, magma_int_t ldu, scalar_t* VT, magma_int_t ldvt, scalar_t* work, magma_int_t lwork, + value_t* rwork, magma_int_t* iwork, magma_int_t* info); template @@ -344,6 +346,24 @@ void magmaCholesky( AT_CUDA_CHECK(cudaGetLastError()); } +template<> +void magmaCholesky>( + magma_uplo_t uplo, magma_int_t n, c10::complex* dA, + magma_int_t ldda, magma_int_t* info) { + MagmaStreamSyncGuard guard; + magma_zpotrf_gpu(uplo, n, reinterpret_cast(dA), ldda, info); + AT_CUDA_CHECK(cudaGetLastError()); +} + +template<> +void magmaCholesky>( + magma_uplo_t uplo, magma_int_t n, c10::complex* dA, + magma_int_t ldda, magma_int_t* info) { + MagmaStreamSyncGuard guard; + magma_cpotrf_gpu(uplo, n, reinterpret_cast(dA), ldda, info); + AT_CUDA_CHECK(cudaGetLastError()); +} 
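// Illustrative aside (not part of this patch): these complex specializations
// pass c10::complex buffers straight to MAGMA via reinterpret_cast, which only
// works because a complex value is laid out as two contiguous reals. A
// standalone sketch of that layout guarantee using std::complex, where the
// standard explicitly sanctions the reinterpretation:
#include <cassert>
#include <complex>

void complex_layout_demo() {
  std::complex<double> z{3.0, -4.0};
  // Since C++11, reinterpreting a std::complex<double> as double[2] is defined:
  // element 0 is the real part, element 1 the imaginary part.
  double* parts = reinterpret_cast<double*>(&z);
  assert(parts[0] == 3.0);
  assert(parts[1] == -4.0);
}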
+ template<> void magmaCholeskyBatched( magma_uplo_t uplo, magma_int_t n, double** dA_array, magma_int_t ldda, @@ -360,6 +380,22 @@ void magmaCholeskyBatched( AT_CUDA_CHECK(cudaGetLastError()); } +template<> +void magmaCholeskyBatched>( + magma_uplo_t uplo, magma_int_t n, c10::complex** dA_array, magma_int_t ldda, + magma_int_t* info_array, magma_int_t batchsize, const MAGMAQueue& magma_queue) { + magma_zpotrf_batched(uplo, n, reinterpret_cast(dA_array), ldda, info_array, batchsize, magma_queue.get_queue()); + AT_CUDA_CHECK(cudaGetLastError()); +} + +template<> +void magmaCholeskyBatched>( + magma_uplo_t uplo, magma_int_t n, c10::complex** dA_array, magma_int_t ldda, + magma_int_t* info_array, magma_int_t batchsize, const MAGMAQueue& magma_queue) { + magma_cpotrf_batched(uplo, n, reinterpret_cast(dA_array), ldda, info_array, batchsize, magma_queue.get_queue()); + AT_CUDA_CHECK(cudaGetLastError()); +} + template<> void magmaTriangularSolve( magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, @@ -453,8 +489,10 @@ void magmaOrgqr( template<> void magmaSymeig( magma_vec_t jobz, magma_uplo_t uplo, magma_int_t n, double* dA, magma_int_t ldda, - double* w, double* wA, magma_int_t ldwa, double* work, magma_int_t lwork, - magma_int_t* iwork, magma_int_t liwork, magma_int_t* info) { + double* w, double* wA, magma_int_t ldwa, double* work, magma_int_t lwork, double* rwork, + magma_int_t lrwork, magma_int_t* iwork, magma_int_t liwork, magma_int_t* info) { + (void)rwork; // unused + (void)lrwork; // unused MagmaStreamSyncGuard guard; magma_dsyevd_gpu(jobz, uplo, n, dA, ldda, w, wA, ldwa, work, lwork, iwork, liwork, info); AT_CUDA_CHECK(cudaGetLastError()); @@ -463,19 +501,46 @@ void magmaSymeig( template<> void magmaSymeig( magma_vec_t jobz, magma_uplo_t uplo, magma_int_t n, float* dA, magma_int_t ldda, - float* w, float* wA, magma_int_t ldwa, float* work, magma_int_t lwork, - magma_int_t* iwork, magma_int_t liwork, magma_int_t* info) { + float* w, float* wA, magma_int_t ldwa, float* work, magma_int_t lwork, float* rwork, + magma_int_t lrwork, magma_int_t* iwork, magma_int_t liwork, magma_int_t* info) { + (void)rwork; // unused + (void)lrwork; // unused MagmaStreamSyncGuard guard; magma_ssyevd_gpu(jobz, uplo, n, dA, ldda, w, wA, ldwa, work, lwork, iwork, liwork, info); AT_CUDA_CHECK(cudaGetLastError()); } +template<> +void magmaSymeig, double>( + magma_vec_t jobz, magma_uplo_t uplo, magma_int_t n, c10::complex* dA, magma_int_t ldda, + double* w, c10::complex* wA, magma_int_t ldwa, c10::complex* work, magma_int_t lwork, double* rwork, + magma_int_t lrwork, magma_int_t* iwork, magma_int_t liwork, magma_int_t* info) { + MagmaStreamSyncGuard guard; + magma_zheevd_gpu( + jobz, uplo, n, reinterpret_cast(dA), ldda, w, reinterpret_cast(wA), + ldwa, reinterpret_cast(work), lwork, rwork, lrwork, iwork, liwork, info); + AT_CUDA_CHECK(cudaGetLastError()); +} + +template<> +void magmaSymeig, float>( + magma_vec_t jobz, magma_uplo_t uplo, magma_int_t n, c10::complex* dA, magma_int_t ldda, + float* w, c10::complex* wA, magma_int_t ldwa, c10::complex* work, magma_int_t lwork, float* rwork, + magma_int_t lrwork, magma_int_t* iwork, magma_int_t liwork, magma_int_t* info) { + MagmaStreamSyncGuard guard; + magma_cheevd_gpu( + jobz, uplo, n, reinterpret_cast(dA), ldda, w, reinterpret_cast(wA), + ldwa, reinterpret_cast(work), lwork, rwork, lrwork, iwork, liwork, info); + AT_CUDA_CHECK(cudaGetLastError()); +} + template<> void magmaSvd( magma_vec_t jobz, magma_int_t m, magma_int_t n, 
double* A, magma_int_t lda, double* s, double* U, magma_int_t ldu, double* VT, magma_int_t ldvt, double* work, magma_int_t lwork, - magma_int_t* iwork, magma_int_t* info) { + double *rwork, magma_int_t* iwork, magma_int_t* info) { + (void)rwork; // unused MagmaStreamSyncGuard guard; magma_dgesdd(jobz, m, n, A, lda, s, U, ldu, VT, ldvt, work, lwork, iwork, info); AT_CUDA_CHECK(cudaGetLastError()); @@ -486,12 +551,43 @@ void magmaSvd( magma_vec_t jobz, magma_int_t m, magma_int_t n, float* A, magma_int_t lda, float* s, float* U, magma_int_t ldu, float* VT, magma_int_t ldvt, float* work, magma_int_t lwork, - magma_int_t* iwork, magma_int_t* info) { + float* rwork, magma_int_t* iwork, magma_int_t* info) { + (void)rwork; // unused MagmaStreamSyncGuard guard; magma_sgesdd(jobz, m, n, A, lda, s, U, ldu, VT, ldvt, work, lwork, iwork, info); AT_CUDA_CHECK(cudaGetLastError()); } +template<> +void magmaSvd, float>( + magma_vec_t jobz, magma_int_t m, magma_int_t n, c10::complex* A, + magma_int_t lda, float* s, c10::complex* U, magma_int_t ldu, + c10::complex* VT, magma_int_t ldvt, c10::complex* work, magma_int_t lwork, + float *rwork, magma_int_t* iwork, magma_int_t* info) { + MagmaStreamSyncGuard guard; + magma_cgesdd(jobz, m, n, reinterpret_cast(A), lda, s, + reinterpret_cast(U), ldu, + reinterpret_cast(VT), ldvt, + reinterpret_cast(work), lwork, + rwork, iwork, info); + AT_CUDA_CHECK(cudaGetLastError()); +} + +template<> +void magmaSvd, double>( + magma_vec_t jobz, magma_int_t m, magma_int_t n, c10::complex* A, + magma_int_t lda, double* s, c10::complex* U, magma_int_t ldu, + c10::complex* VT, magma_int_t ldvt, c10::complex* work, magma_int_t lwork, + double *rwork, magma_int_t* iwork, magma_int_t* info) { + MagmaStreamSyncGuard guard; + magma_zgesdd(jobz, m, n, reinterpret_cast(A), lda, s, + reinterpret_cast(U), ldu, + reinterpret_cast(VT), ldvt, + reinterpret_cast(work), lwork, + rwork, iwork, info); + AT_CUDA_CHECK(cudaGetLastError()); +} + template<> void magmaLuSolve( magma_int_t n, magma_int_t nrhs, double* dA, magma_int_t ldda, magma_int_t* ipiv, @@ -904,7 +1000,7 @@ Tensor _cholesky_helper_cuda(const Tensor& self, bool upper) { self_working_copy = cloneBatchedColumnMajor(self); } - AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "cholesky_cuda", [&]{ + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "cholesky_cuda", [&]{ apply_cholesky(self_working_copy, false, infos); }); if (self.dim() > 2) { @@ -1201,8 +1297,9 @@ static void apply_symeig(Tensor& self, Tensor& eigvals, bool eigenvectors, bool AT_ERROR("symeig: MAGMA library not found in " "compilation. 
Please rebuild with MAGMA."); #else + using value_t = typename c10::scalar_value_type::type; auto self_data = self.data_ptr(); - auto eigvals_data = eigvals.data_ptr(); + auto eigvals_data = eigvals.data_ptr(); auto self_matrix_stride = matrixStride(self); auto eigvals_stride = eigvals.size(-1); int64_t batch_size = batchCount(self); @@ -1223,20 +1320,30 @@ AT_ERROR("symeig: MAGMA library not found in " scalar_t wkopt; magma_int_t liwork = -1; magma_int_t iwkopt; - magmaSymeig(jobz, uplo, n, self_data, n, eigvals_data, wA, n, &wkopt, lwork, &iwkopt, liwork, &info); + magma_int_t lrwork = -1; + value_t rwkopt; + magmaSymeig(jobz, uplo, n, self_data, n, eigvals_data, wA, n, &wkopt, lwork, &rwkopt, lrwork, &iwkopt, liwork, &info); scalar_t* work; magma_int_t* iwork; - lwork = magma_int_cast(wkopt, "work_size"); + lwork = magma_int_cast(real_impl(wkopt), "work_size"); liwork = magma_int_cast(iwkopt, "iwork_size"); ALLOCATE_ARRAY(work, scalar_t, lwork); ALLOCATE_ARRAY(iwork, magma_int_t, liwork); + value_t* rwork = nullptr; + c10::Storage storage_rwork; + if (isComplexType(at::typeMetaToScalarType(self.dtype()))) { + lrwork = magma_int_cast(rwkopt, "rwork_size"); + storage_rwork = pin_memory(lrwork); + rwork = static_cast(storage_rwork.data()); + } + for (int64_t i = 0; i < batch_size; i++) { scalar_t* self_working_ptr = &self_data[i * self_matrix_stride]; - scalar_t* eigvals_working_ptr = &eigvals_data[i * eigvals_stride]; - magmaSymeig(jobz, uplo, n, self_working_ptr, n, eigvals_working_ptr, - wA, n, work, lwork, iwork, liwork, &info); + value_t* eigvals_working_ptr = &eigvals_data[i * eigvals_stride]; + magmaSymeig(jobz, uplo, n, self_working_ptr, n, eigvals_working_ptr, + wA, n, work, lwork, rwork, lrwork, iwork, liwork, &info); infos[i] = info; if (info != 0) { return; @@ -1250,6 +1357,7 @@ std::tuple _symeig_helper_cuda(const Tensor& self, bool eigenvec auto self_sizes = self.sizes().vec(); self_sizes.pop_back(); + ScalarType dtype = toValueType(typeMetaToScalarType(self.dtype())); // magmaSymeig uses a hybrid CPU-GPU algorithm to compute the eigenvalues and eigenvectors. // The driver routine magma_(d/s)syev_gpu accepts a tensor on the CPU for eigvalenvalues. @@ -1257,15 +1365,15 @@ std::tuple _symeig_helper_cuda(const Tensor& self, bool eigenvec // In the case where self.numel() == 0, we just return an empty tensor of // dimensions on the CUDA (to avoid the unnecessary "to(at::kCUDA)") auto eigvals_working_copy = self.numel() == 0 - ? at::empty(self_sizes, self.options()) - : at::empty(self_sizes, self.options().device(at::kCPU)); + ? at::empty(self_sizes, self.options().dtype(dtype)) + : at::empty(self_sizes, self.options().dtype(dtype).device(at::kCPU)); if (self.numel() == 0) { return std::tuple(eigvals_working_copy, at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT)); } auto self_working_copy = cloneBatchedColumnMajor(self); - AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "symeig_cuda", [&]{ + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "symeig_cuda", [&]{ apply_symeig(self_working_copy, eigvals_working_copy, eigenvectors, upper, infos); }); @@ -1290,9 +1398,10 @@ static void apply_svd(Tensor& self, Tensor& U, Tensor& S, Tensor& VT, AT_ERROR("svd: MAGMA library not found in " "compilation. 
Please rebuild with MAGMA."); #else + using value_t = typename c10::scalar_value_type::type; auto self_data = self.data_ptr(); auto U_data = U.data_ptr(); - auto S_data = S.data_ptr(); + auto S_data = S.data_ptr(); auto VT_data = VT.data_ptr(); auto self_stride = matrixStride(self); auto U_stride = matrixStride(U); @@ -1304,7 +1413,18 @@ AT_ERROR("svd: MAGMA library not found in " magma_int_t m = magma_int_cast(self.size(-2), "m"); magma_int_t n = magma_int_cast(self.size(-1), "n"); - auto k = std::min(m, n); + auto mn = std::min(m, n); + + c10::Storage storage_rwork; + value_t* rwork = nullptr; + + magma_int_t* iwork; + ALLOCATE_ARRAY(iwork, magma_int_t, 8 * mn); + if (isComplexType(at::typeMetaToScalarType(self.dtype()))) { + auto lrwork = computeLRWorkDim(jobchar, m, n); + storage_rwork = pin_memory(lrwork); + rwork = static_cast(storage_rwork.data()); + } magma_int_t info = 0; // Run once, first to get the optimum work size. @@ -1313,22 +1433,20 @@ AT_ERROR("svd: MAGMA library not found in " // and (batch_size - 1) calls to allocate and deallocate workspace using at::empty() magma_int_t lwork = -1; scalar_t wkopt; - magma_int_t* iwork; - ALLOCATE_ARRAY(iwork, magma_int_t, 8 * k); - magmaSvd(jobz, m, n, self_data, m, S_data, U_data, m, VT_data, n, &wkopt, lwork, iwork, &info); - lwork = magma_int_cast(wkopt, "work_size"); + magmaSvd(jobz, m, n, self_data, m, S_data, U_data, m, VT_data, n, &wkopt, lwork, rwork, iwork, &info); + lwork = magma_int_cast(real_impl(wkopt), "work_size"); scalar_t* work; ALLOCATE_ARRAY(work, scalar_t, lwork); for (int64_t i = 0; i < batchsize; i++) { scalar_t* self_working_ptr = &self_data[i * self_stride]; - scalar_t* S_working_ptr = &S_data[i * S_stride]; + value_t* S_working_ptr = &S_data[i * S_stride]; scalar_t* U_working_ptr = &U_data[i * U_stride]; scalar_t* VT_working_ptr = &VT_data[i * VT_stride]; // Compute S, U (optionally), VT (optionally) - magmaSvd(jobz, m, n, self_working_ptr, m, - S_working_ptr, U_working_ptr, m, VT_working_ptr, n, work, lwork, iwork, &info); + magmaSvd(jobz, m, n, self_working_ptr, m, + S_working_ptr, U_working_ptr, m, VT_working_ptr, n, work, lwork, rwork, iwork, &info); infos[i] = info; if (info != 0) { return; @@ -1361,7 +1479,7 @@ std::tuple _svd_helper_cuda(const Tensor& self, bool som at::TensorOptions(at::kCPU).dtype(self.dtype()).pinned_memory(true)); self_working_copy.copy_(self); - AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "svd_cuda", [&]{ + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "svd_cuda", [&] { apply_svd(self_working_copy, U_working_copy, S_working_copy, VT_working_copy, jobchar, infos); }); diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.h b/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.h index 85014c5773ee..dc6dc2f9daca 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.h +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.h @@ -7,8 +7,8 @@ #include #include -#if defined(CUDART_VERSION) && CUDART_VERSION >= 10000 -// some cusolver functions doesn't work well on cuda 9.2, cusolver is used on cuda >= 10.0 +#if defined(CUDART_VERSION) && defined(CUSOLVER_VERSION) && CUSOLVER_VERSION >= 10200 +// some cusolver functions don't work well on cuda 9.2 or cuda 10.1.105, cusolver is used on cuda >= 10.1.243 #define USE_CUSOLVER #endif diff --git a/aten/src/ATen/native/cuda/BinaryAddSubKernel.cu b/aten/src/ATen/native/cuda/BinaryAddSubKernel.cu index f05d73453dcf..864fb0a848df 100644 --- a/aten/src/ATen/native/cuda/BinaryAddSubKernel.cu +++ 
b/aten/src/ATen/native/cuda/BinaryAddSubKernel.cu @@ -8,12 +8,20 @@ namespace at { namespace native { +template +struct AddFunctor { + AddFunctor(scalar_t a): alpha(a) {} + __device__ __forceinline__ scalar_t operator() (const scalar_t a, const scalar_t b) const { + return a + alpha * b; + } + private: + scalar_t alpha; +}; + void add_kernel_cuda(TensorIterator& iter, Scalar alpha_scalar) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, kBFloat16, iter.common_dtype(), "add_cuda/sub_cuda", [&]() { - auto alpha = alpha_scalar.to(); - gpu_kernel_with_scalars(iter, [alpha]GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return a + alpha * b; - }); + AddFunctor f(alpha_scalar.to()); + gpu_kernel_with_scalars(iter, f); }); } diff --git a/aten/src/ATen/native/cuda/BinaryBitwiseOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryBitwiseOpsKernels.cu index 128c05bed3cb..30894b568762 100644 --- a/aten/src/ATen/native/cuda/BinaryBitwiseOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryBitwiseOpsKernels.cu @@ -9,60 +9,67 @@ namespace at { namespace native { -void bitwise_and_kernel_cuda(TensorIterator& iter) { - if (iter.dtype() == ScalarType::Bool) { - gpu_kernel_with_scalars( - iter, - []GPU_LAMBDA(bool a, bool b) { - return a && b; - }); - } else { - AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "bitwise_and_cuda", [&]() { - gpu_kernel_with_scalars( - iter, - []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return a & b; - }); - }); +template +struct BitwiseAndFunctor { + __device__ __forceinline__ scalar_t operator()(scalar_t a, scalar_t b) const { + return a & b; + } +}; + +template<> +struct BitwiseAndFunctor { + __device__ __forceinline__ bool operator()(bool a, bool b) const { + return a && b; } +}; + +void bitwise_and_kernel_cuda(TensorIterator& iter) { + AT_DISPATCH_INTEGRAL_TYPES_AND(kBool, iter.dtype(), "bitwise_and_cuda", [&]() { + BitwiseAndFunctor f; + gpu_kernel_with_scalars(iter, f); + }); } -void bitwise_or_kernel_cuda(TensorIterator& iter) { - if (iter.dtype() == ScalarType::Bool) { - gpu_kernel_with_scalars( - iter, - []GPU_LAMBDA(bool a, bool b) { - return a || b; - }); - } else { - AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "bitwise_or_cuda", [&]() { - gpu_kernel_with_scalars( - iter, - []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return a | b; - }); - }); +template +struct BitwiseOrFunctor { + __device__ __forceinline__ scalar_t operator()(scalar_t a, scalar_t b) const { + return a | b; + } +}; + +template<> +struct BitwiseOrFunctor { + __device__ __forceinline__ bool operator()(bool a, bool b) const { + return a || b; } +}; + +void bitwise_or_kernel_cuda(TensorIterator& iter) { + AT_DISPATCH_INTEGRAL_TYPES_AND(kBool, iter.dtype(), "bitwise_or_cuda", [&]() { + BitwiseOrFunctor f; + gpu_kernel_with_scalars(iter, f); + }); } -void bitwise_xor_kernel_cuda(TensorIterator& iter) { - if (iter.dtype() == ScalarType::Bool) { - // Boolean type does not work with ^ (bitwise XOR) in C++. bitwise_xor wraps this operation for both Boolean and - // integral types. 
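// Illustrative aside (not part of this patch): throughout this file the
// per-dtype lambdas are being replaced by small named functor templates, each
// with an explicit bool specialization (the removed comment above explains why
// bool needs logical rather than bitwise operators). A standalone host-side
// sketch of the same specialization pattern, applied with std::transform:
#include <algorithm>
#include <vector>

template <typename T>
struct XorFunctorSketch {
  T operator()(T a, T b) const { return a ^ b; }            // bitwise XOR for integers
};

template <>
struct XorFunctorSketch<bool> {
  bool operator()(bool a, bool b) const { return a != b; }  // logical XOR for bool
};

template <typename T>
std::vector<T> elementwise_xor(const std::vector<T>& a, const std::vector<T>& b) {
  std::vector<T> out(a.size());
  std::transform(a.begin(), a.end(), b.begin(), out.begin(), XorFunctorSketch<T>{});
  return out;
}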
- gpu_kernel_with_scalars( - iter, - []GPU_LAMBDA(bool a, bool b) { - return a != b; - }); - } else { - AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "bitwise_xor_cuda", [&]() { - gpu_kernel_with_scalars( - iter, - []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return a ^ b; - }); - }); +template +struct BitwiseXorFunctor { + __device__ __forceinline__ scalar_t operator()(scalar_t a, scalar_t b) const { + return a ^ b; } +}; + +template<> +struct BitwiseXorFunctor { + __device__ __forceinline__ bool operator()(bool a, bool b) const { + return a != b; + } +}; + +void bitwise_xor_kernel_cuda(TensorIterator& iter) { + AT_DISPATCH_INTEGRAL_TYPES_AND(kBool, iter.dtype(), "bitwise_xor_cuda", [&]() { + BitwiseXorFunctor f; + gpu_kernel_with_scalars(iter, f); + }); } REGISTER_DISPATCH(bitwise_and_stub, &bitwise_and_kernel_cuda); diff --git a/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu index 2a9b188520f5..de11baa28210 100644 --- a/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu @@ -10,7 +10,8 @@ namespace at { namespace native { void logical_and_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.common_dtype(), "logical_and_cuda", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, ScalarType::BFloat16, + iter.common_dtype(), "logical_and_cuda", [&]() { gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { return a && b; }); @@ -18,7 +19,8 @@ void logical_and_kernel_cuda(TensorIterator& iter) { } void logical_or_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.common_dtype(), "logical_or_cuda", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, ScalarType::BFloat16, + iter.common_dtype(), "logical_or_cuda", [&]() { gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { return a || b; }); @@ -26,7 +28,8 @@ void logical_or_kernel_cuda(TensorIterator& iter) { } void logical_xor_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.common_dtype(), "logical_xor_cuda", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, ScalarType::BFloat16, + iter.common_dtype(), "logical_xor_cuda", [&]() { gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { return bool(a) != bool(b); }); diff --git a/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu index a2ffdb75c84b..fc9aa74f91f4 100644 --- a/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu @@ -19,11 +19,12 @@ void atan2_kernel_cuda(TensorIterator& iter) { }); } -void smooth_l1_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "smooth_l1_cuda", [&]() { - gpu_kernel(iter, [] GPU_LAMBDA (scalar_t a, scalar_t b) -> scalar_t { +void smooth_l1_kernel_cuda(TensorIterator& iter, double beta) { + AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "smooth_l1_cuda", [&iter, beta]() { + scalar_t beta_val(beta); + gpu_kernel(iter, [beta_val] GPU_LAMBDA (scalar_t a, scalar_t b) -> scalar_t { auto z = ::abs(a - b); - return z < scalar_t(1.) ? scalar_t(0.5) * z * z : z - scalar_t(0.5); + return z < beta_val ? 
scalar_t(0.5) * z * z / beta_val : z - scalar_t(0.5) * beta_val; }); }); } diff --git a/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu b/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu index 044fc955b954..be3f4f0bb01e 100644 --- a/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu +++ b/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu @@ -11,6 +11,39 @@ namespace at { namespace native { +template +struct MulScalarFunctor { + MulScalarFunctor(accscalar_t b_): b(b_) {} + __device__ scalar_t operator() (scalar_t a) const { + return a * b; + } + private: + accscalar_t b; +}; + +template +struct DivFunctor { + __device__ scalar_t operator() (scalar_t a, scalar_t b) const { + return a / b; + } +}; + +template +struct MulFunctor { + __device__ scalar_t operator() (scalar_t a, scalar_t b) const { + return a * b; + } +}; + +// Workaround for the error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context] +template<> +struct MulFunctor { + __device__ bool operator() (bool a, bool b) const { + return a && b; + } +}; + + void div_kernel_cuda(TensorIterator& iter) { if (!isIntegralType(iter.common_dtype(), /*includeBool*/ false) && iter.is_cpu_scalar(2)) { // optimization for floating-point types: if the second operand is a CPU @@ -20,44 +53,35 @@ void div_kernel_cuda(TensorIterator& iter) { using accscalar_t = at::acc_type; auto inv_b = accscalar_t(1.0) / iter.scalar_value(2); iter.remove_operand(2); - gpu_kernel(iter, [inv_b]GPU_LAMBDA(scalar_t a) -> scalar_t { - return a * inv_b; - }); + MulScalarFunctor f(inv_b); + gpu_kernel(iter, f); }); } else { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, iter.common_dtype(), "div_cuda", [&]() { - gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return a / b; - }); + DivFunctor f; + gpu_kernel_with_scalars(iter, f); }); } } void mul_kernel_cuda(TensorIterator& iter) { - if (iter.common_dtype() == ScalarType::Bool) { - // Workaround for the error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context] - gpu_kernel_with_scalars(iter, []GPU_LAMBDA(bool a, bool b) -> bool { - return a && b; - }); - } else if (!isIntegralType(iter.common_dtype(), /*includeBool*/ false) && + if (!isIntegralType(iter.common_dtype(), /*includeBool*/ false) && (iter.is_cpu_scalar(1) || iter.is_cpu_scalar(2))) { - //if common dtype is half the scalar constant can overflow in half precision, and yet the result can - //still be representable in the half dtype. Cast scalar to acc_type to have better accuracy - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "mul_cuda", [&]() { - using accscalar_t = at::acc_type; - int scalar_arg = iter.is_cpu_scalar(1) ? 1 : 2; - auto b = iter.scalar_value(scalar_arg); - iter.remove_operand(scalar_arg); - const cuda::OptionalCUDAGuard device_guard(device_of(iter.tensor(1))); - gpu_kernel(iter, [b]GPU_LAMBDA(scalar_t a) -> scalar_t { - return a * b; - }); - }); + //if common dtype is half the scalar constant can overflow in half precision, and yet the result can + //still be representable in the half dtype. Cast scalar to acc_type to have better accuracy + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "mul_cuda", [&]() { + using accscalar_t = at::acc_type; + int scalar_arg = iter.is_cpu_scalar(1) ? 
1 : 2; + auto b = iter.scalar_value(scalar_arg); + iter.remove_operand(scalar_arg); + const cuda::OptionalCUDAGuard device_guard(device_of(iter.tensor(1))); + MulScalarFunctor f(b); + gpu_kernel(iter, f); + }); } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, iter.common_dtype(), "mul_cuda", [&]() { - gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return a * b; - }); + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "mul_cuda", [&]() { + MulFunctor f; + gpu_kernel_with_scalars(iter, f); }); } } diff --git a/aten/src/ATen/native/cuda/CompareEQKernel.cu b/aten/src/ATen/native/cuda/CompareEQKernel.cu index 947b53bce8fd..20f76ce0d8e1 100644 --- a/aten/src/ATen/native/cuda/CompareEQKernel.cu +++ b/aten/src/ATen/native/cuda/CompareEQKernel.cu @@ -10,11 +10,16 @@ namespace at { namespace native { +template +struct CompareEqFunctor { + __device__ __forceinline__ bool operator() (scalar_t a, scalar_t b) const { + return a == b; + } +}; + void eq_kernel_cuda(TensorIterator& iter) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "eq_cuda", [&]() { - gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { - return a == b; - }); + gpu_kernel_with_scalars(iter, CompareEqFunctor()); }); } diff --git a/aten/src/ATen/native/cuda/CompareGEKernel.cu b/aten/src/ATen/native/cuda/CompareGEKernel.cu index e276237ea8e6..c96b7f3929bc 100644 --- a/aten/src/ATen/native/cuda/CompareGEKernel.cu +++ b/aten/src/ATen/native/cuda/CompareGEKernel.cu @@ -10,11 +10,16 @@ namespace at { namespace native { +template +struct CompareGEFunctor { + __device__ __forceinline__ bool operator() (scalar_t a, scalar_t b) const { + return a >= b; + } +}; + void ge_kernel_cuda(TensorIterator& iter) { AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "ge_cuda", [&]() { - gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { - return a >= b; - }); + gpu_kernel_with_scalars(iter, CompareGEFunctor()); }); } diff --git a/aten/src/ATen/native/cuda/CompareGTKernel.cu b/aten/src/ATen/native/cuda/CompareGTKernel.cu index c17b14855dd6..cbd189ed1b6d 100644 --- a/aten/src/ATen/native/cuda/CompareGTKernel.cu +++ b/aten/src/ATen/native/cuda/CompareGTKernel.cu @@ -10,11 +10,16 @@ namespace at { namespace native { +template +struct CompareGTFunctor { + __device__ __forceinline__ bool operator() (scalar_t a, scalar_t b) const { + return a > b; + } +}; + void gt_kernel_cuda(TensorIterator& iter) { AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "gt_cuda", [&]() { - gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { - return a > b; - }); + gpu_kernel_with_scalars(iter, CompareGTFunctor()); }); } diff --git a/aten/src/ATen/native/cuda/CompareLEKernel.cu b/aten/src/ATen/native/cuda/CompareLEKernel.cu index 3987b87e918c..13e60a78ffb2 100644 --- a/aten/src/ATen/native/cuda/CompareLEKernel.cu +++ b/aten/src/ATen/native/cuda/CompareLEKernel.cu @@ -10,11 +10,16 @@ namespace at { namespace native { +template +struct CompareLEFunctor { + __device__ __forceinline__ bool operator() (scalar_t a, scalar_t b) const { + return a <= b; + } +}; + void le_kernel_cuda(TensorIterator& iter) { AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "le_cuda", [&]() { - gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { - return a <= b; - }); + gpu_kernel_with_scalars(iter, 
CompareLEFunctor()); }); } diff --git a/aten/src/ATen/native/cuda/CompareLTKernel.cu b/aten/src/ATen/native/cuda/CompareLTKernel.cu index 3684d65f6631..e301284c83e7 100644 --- a/aten/src/ATen/native/cuda/CompareLTKernel.cu +++ b/aten/src/ATen/native/cuda/CompareLTKernel.cu @@ -10,11 +10,16 @@ namespace at { namespace native { +template +struct CompareLTFunctor { + __device__ __forceinline__ bool operator() (scalar_t a, scalar_t b) const { + return a < b; + } +}; + void lt_kernel_cuda(TensorIterator& iter) { AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "lt_cuda", [&]() { - gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { - return a < b; - }); + gpu_kernel_with_scalars(iter, CompareLTFunctor()); }); } diff --git a/aten/src/ATen/native/cuda/CompareNEKernel.cu b/aten/src/ATen/native/cuda/CompareNEKernel.cu index 0834a0d2b3bb..3ef397ec5200 100644 --- a/aten/src/ATen/native/cuda/CompareNEKernel.cu +++ b/aten/src/ATen/native/cuda/CompareNEKernel.cu @@ -10,11 +10,16 @@ namespace at { namespace native { +template +struct CompareNEFunctor { + __device__ __forceinline__ bool operator() (scalar_t a, scalar_t b) const { + return a != b; + } +}; + void ne_kernel_cuda(TensorIterator& iter) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "ne_cuda", [&]() { - gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { - return a != b; - }); + gpu_kernel_with_scalars(iter, CompareNEFunctor()); }); } diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu index c629dfc4030c..3e0e70c01952 100644 --- a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu @@ -366,70 +366,68 @@ void max_pool2d_with_indices_out_cuda_template( AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "max_pool2d_with_indices_out_cuda_frame", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "max_pool2d_with_indices_out_cuda_frame", [&] { - using accscalar_t = acc_type; - - scalar_t *output_data = output.data_ptr(); - scalar_t *input_data = input.data_ptr(); - int64_t *indices_data = indices.data_ptr(); - - switch (memory_format) { - case MemoryFormat::ChannelsLast: { - const int max_threads = std::min( - at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, CUDA_MAX_THREADS); - int* maxThreadsDim = at::cuda::getCurrentDeviceProperties()->maxThreadsDim; - int block_x = std::min( - maxThreadsDim[0], std::min(lastPow2(nInputPlane), at::cuda::warp_size())); - int block_y = std::min( - maxThreadsDim[1], std::min(lastPow2(outputWidth), max_threads / block_x)); - int block_z = std::min( - maxThreadsDim[2], std::min(lastPow2(outputHeight), max_threads / block_x / block_y)); - block_x = std::min( - maxThreadsDim[0], std::min(lastPow2(nInputPlane), max_threads / block_y / block_z)); - const dim3 block(block_x, block_y, block_z); - - int kernel_stride_C = cuda::ATenCeilDiv( - safe_downcast(nInputPlane), block_x * 4); - int kernel_size_C = cuda::ATenCeilDiv( - safe_downcast(nInputPlane), block_x * kernel_stride_C); - - int grid_x = nbatch*kernel_stride_C; - int grid_y = std::min( - at::cuda::getCurrentDeviceProperties()->maxGridSize[1], - cuda::ATenCeilDiv(safe_downcast(outputWidth), block_y*BLOCK_STRIDE)); - int grid_z = std::min( - at::cuda::getCurrentDeviceProperties()->maxGridSize[2], - cuda::ATenCeilDiv(safe_downcast(outputHeight), block_z*BLOCK_STRIDE)); - const dim3 grid(grid_x, grid_y, grid_z); - - size_t 
shmem_size = (kernel_size_C * block_x*block_y*block_z) * (sizeof(int) + sizeof(scalar_t)); - AT_ASSERT(shmem_size <= at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock); - - max_pool_forward_nhwc - <<>>( - input_data, nbatch, - nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, - kH, kW, dH, dW, padH, padW, dilationH, dilationW, - in_stride_n, in_stride_c, - in_stride_h, in_stride_w, - kernel_stride_C, kernel_size_C, - output_data, indices_data); - break; - } - case MemoryFormat::Contiguous: { - const int num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, - BLOCK_THREADS); - max_pool_forward_nchw - <<>>( - count, input_data, - nbatch, nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, - kH, kW, dH, dW, padH, padW, dilationH, dilationW, - output_data, indices_data); - break; - } - default: TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + using accscalar_t = acc_type; + + scalar_t *output_data = output.data_ptr(); + scalar_t *input_data = input.data_ptr(); + int64_t *indices_data = indices.data_ptr(); + + switch (memory_format) { + case MemoryFormat::ChannelsLast: { + const int max_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, CUDA_MAX_THREADS); + int* maxThreadsDim = at::cuda::getCurrentDeviceProperties()->maxThreadsDim; + int block_x = std::min( + maxThreadsDim[0], std::min(lastPow2(nInputPlane), at::cuda::warp_size())); + int block_y = std::min( + maxThreadsDim[1], std::min(lastPow2(outputWidth), max_threads / block_x)); + int block_z = std::min( + maxThreadsDim[2], std::min(lastPow2(outputHeight), max_threads / block_x / block_y)); + block_x = std::min( + maxThreadsDim[0], std::min(lastPow2(nInputPlane), max_threads / block_y / block_z)); + const dim3 block(block_x, block_y, block_z); + + int kernel_stride_C = cuda::ATenCeilDiv( + safe_downcast(nInputPlane), block_x * 4); + int kernel_size_C = cuda::ATenCeilDiv( + safe_downcast(nInputPlane), block_x * kernel_stride_C); + + int grid_x = nbatch*kernel_stride_C; + int grid_y = std::min( + at::cuda::getCurrentDeviceProperties()->maxGridSize[1], + cuda::ATenCeilDiv(safe_downcast(outputWidth), block_y*BLOCK_STRIDE)); + int grid_z = std::min( + at::cuda::getCurrentDeviceProperties()->maxGridSize[2], + cuda::ATenCeilDiv(safe_downcast(outputHeight), block_z*BLOCK_STRIDE)); + const dim3 grid(grid_x, grid_y, grid_z); + + size_t shmem_size = (kernel_size_C * block_x*block_y*block_z) * (sizeof(int) + sizeof(scalar_t)); + AT_ASSERT(shmem_size <= at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock); + + max_pool_forward_nhwc + <<>>( + input_data, nbatch, + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + in_stride_n, in_stride_c, + in_stride_h, in_stride_w, + kernel_stride_C, kernel_size_C, + output_data, indices_data); + break; } - }); + case MemoryFormat::Contiguous: { + const int num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, + BLOCK_THREADS); + max_pool_forward_nchw + <<>>( + count, input_data, + nbatch, nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + output_data, indices_data); + break; + } + default: TORCH_CHECK(false, "Unsupported memory format. 
Supports only ChannelsLast, Contiguous"); + } } ); @@ -532,88 +530,86 @@ void max_pool2d_with_indices_backward_out_cuda_template( AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "max_pool2d_with_indices_out_cuda_frame", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "max_pool2d_with_indices_out_cuda_frame", [&] { - using accscalar_t = acc_type; - - scalar_t *gradOutput_data = gradOutput.data_ptr(); - scalar_t *gradInput_data = gradInput.data_ptr(); - int64_t *indices_data = indices.data_ptr(); - - switch (memory_format) { - case MemoryFormat::ChannelsLast: { - const int max_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, CUDA_MAX_THREADS); - int* maxThreadsDim = at::cuda::getCurrentDeviceProperties()->maxThreadsDim; - int block_x = std::min( - maxThreadsDim[0], std::min(lastPow2(nInputPlane), at::cuda::warp_size())); - int block_y = std::min( - maxThreadsDim[1], std::min(lastPow2(inputWidth), max_threads / block_x)); - int block_z = std::min( - maxThreadsDim[2], std::min(lastPow2(inputHeight), max_threads / block_x / block_y)); - block_x = std::min( - maxThreadsDim[0], std::min(lastPow2(nInputPlane), max_threads / block_y / block_z)); - const dim3 block(block_x, block_y, block_z); - - int kernel_stride_C = cuda::ATenCeilDiv( - safe_downcast(nInputPlane), block_x * 4); - int kernel_size_C = cuda::ATenCeilDiv( - safe_downcast(nInputPlane), block_x * kernel_stride_C); - - int grid_x = nbatch*kernel_stride_C; - int grid_y = std::min( - at::cuda::getCurrentDeviceProperties()->maxGridSize[1], - cuda::ATenCeilDiv(safe_downcast(inputWidth), block_y*BLOCK_STRIDE)); - int grid_z = std::min( - at::cuda::getCurrentDeviceProperties()->maxGridSize[2], - cuda::ATenCeilDiv(safe_downcast(inputHeight), block_z*BLOCK_STRIDE)); - const dim3 grid(grid_x, grid_y, grid_z); - - size_t shmem_size = (kernel_size_C * block_x*block_y*block_z) * sizeof(accscalar_t); - AT_ASSERT(shmem_size <= at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock); - - // The backward kernel is launched on input instead output. - // If it is launched on output layer, atomic_add would not provide much benefit on FP16. - // Please check comments at https://github.com/pytorch/pytorch/pull/34519. - max_pool_backward_nhwc - <<>>( - count, - gradOutput_data, - indices_data, - nbatch, - nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, - kH, kW, dH, dW, padH, padW, dilationH, dilationW, - out_stride_c, out_stride_h, out_stride_w, - in_stride_n, in_stride_c, - in_stride_h, in_stride_w, - kernel_stride_C, kernel_size_C, - gradInput_data); - break; - } - case MemoryFormat::Contiguous: { - int imgcount = inputWidth * inputHeight; - dim3 grid; - const int blocks = (imgcount + BLOCK_THREADS - 1) / BLOCK_THREADS; - grid.x = blocks; - grid.y = nbatch; - uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; - if (maxGridY < grid.y) grid.y = maxGridY; - grid.z = nInputPlane; - uint64_t maxGridZ = at::cuda::getCurrentDeviceProperties()->maxGridSize[2]; - if (maxGridZ < grid.z) grid.z = maxGridZ; - - max_pool_backward_nchw - <<>>( - count, - gradOutput_data, - indices_data, - nbatch, - nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, - kH, kW, dH, dW, padH, padW, dilationH, dilationW, - gradInput_data); - break; - } - default: TORCH_CHECK(false, "Unsupported memory format. 
Supports only ChannelsLast, Contiguous"); + using accscalar_t = acc_type; + + scalar_t *gradOutput_data = gradOutput.data_ptr(); + scalar_t *gradInput_data = gradInput.data_ptr(); + int64_t *indices_data = indices.data_ptr(); + + switch (memory_format) { + case MemoryFormat::ChannelsLast: { + const int max_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, CUDA_MAX_THREADS); + int* maxThreadsDim = at::cuda::getCurrentDeviceProperties()->maxThreadsDim; + int block_x = std::min( + maxThreadsDim[0], std::min(lastPow2(nInputPlane), at::cuda::warp_size())); + int block_y = std::min( + maxThreadsDim[1], std::min(lastPow2(inputWidth), max_threads / block_x)); + int block_z = std::min( + maxThreadsDim[2], std::min(lastPow2(inputHeight), max_threads / block_x / block_y)); + block_x = std::min( + maxThreadsDim[0], std::min(lastPow2(nInputPlane), max_threads / block_y / block_z)); + const dim3 block(block_x, block_y, block_z); + + int kernel_stride_C = cuda::ATenCeilDiv( + safe_downcast(nInputPlane), block_x * 4); + int kernel_size_C = cuda::ATenCeilDiv( + safe_downcast(nInputPlane), block_x * kernel_stride_C); + + int grid_x = nbatch*kernel_stride_C; + int grid_y = std::min( + at::cuda::getCurrentDeviceProperties()->maxGridSize[1], + cuda::ATenCeilDiv(safe_downcast(inputWidth), block_y*BLOCK_STRIDE)); + int grid_z = std::min( + at::cuda::getCurrentDeviceProperties()->maxGridSize[2], + cuda::ATenCeilDiv(safe_downcast(inputHeight), block_z*BLOCK_STRIDE)); + const dim3 grid(grid_x, grid_y, grid_z); + + size_t shmem_size = (kernel_size_C * block_x*block_y*block_z) * sizeof(accscalar_t); + AT_ASSERT(shmem_size <= at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock); + + // The backward kernel is launched on input instead output. + // If it is launched on output layer, atomic_add would not provide much benefit on FP16. + // Please check comments at https://github.com/pytorch/pytorch/pull/34519. + max_pool_backward_nhwc + <<>>( + count, + gradOutput_data, + indices_data, + nbatch, + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + out_stride_c, out_stride_h, out_stride_w, + in_stride_n, in_stride_c, + in_stride_h, in_stride_w, + kernel_stride_C, kernel_size_C, + gradInput_data); + break; } - }); + case MemoryFormat::Contiguous: { + int imgcount = inputWidth * inputHeight; + dim3 grid; + const int blocks = (imgcount + BLOCK_THREADS - 1) / BLOCK_THREADS; + grid.x = blocks; + grid.y = nbatch; + uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; + if (maxGridY < grid.y) grid.y = maxGridY; + grid.z = nInputPlane; + uint64_t maxGridZ = at::cuda::getCurrentDeviceProperties()->maxGridSize[2]; + if (maxGridZ < grid.z) grid.z = maxGridZ; + + max_pool_backward_nchw + <<>>( + count, + gradOutput_data, + indices_data, + nbatch, + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + gradInput_data); + break; + } + default: TORCH_CHECK(false, "Unsupported memory format. 
Supports only ChannelsLast, Contiguous"); + } } ); diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu index 2b0ba37c8880..9d72e0027007 100644 --- a/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu +++ b/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu @@ -276,20 +276,18 @@ void max_pool3d_with_indices_out_cuda_template( input.scalar_type(), "max_pool3d_with_indices_out_frame", [&]{ - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "max_pool3d_with_indices_out_frame", [&] { - scalar_t *input_data = work_input.data_ptr(); - int64_t totalZ = otime * nslices * nbatch; - - max_pool3d_with_indices_out_frame( - input_data, work_output, work_indices, - totalZ, - itime, iheight, iwidth, - otime, oheight, owidth, - kT, kH, kW, - dT, dH, dW, - pT, pH, pW, - dilationT, dilationH, dilationW); - }); + scalar_t *input_data = work_input.data_ptr(); + int64_t totalZ = otime * nslices * nbatch; + + max_pool3d_with_indices_out_frame( + input_data, work_output, work_indices, + totalZ, + itime, iheight, iwidth, + otime, oheight, owidth, + kT, kH, kW, + dT, dH, dW, + pT, pH, pW, + dilationT, dilationH, dilationW); } ); } @@ -387,19 +385,17 @@ void max_pool3d_with_indices_backward_out_cuda_template( AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "max_pool3d_with_indices_backward_out_frame", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "max_pool3d_with_indices_backward_out_frame", [&] { - const int64_t totalZ = otime * nslices * nbatch; - scalar_t *grad_input_data = work_grad_input.data_ptr(); - - max_pool3d_with_indices_backward_out_frame( - grad_input_data, work_grad_output, work_indices, - totalZ, - itime, iheight, iwidth, - oheight, owidth, - dT, dH, dW, - pT, pH, pW, - dilationT, dilationH, dilationW); - }); + const int64_t totalZ = otime * nslices * nbatch; + scalar_t *grad_input_data = work_grad_input.data_ptr(); + + max_pool3d_with_indices_backward_out_frame( + grad_input_data, work_grad_output, work_indices, + totalZ, + itime, iheight, iwidth, + oheight, owidth, + dT, dH, dW, + pT, pH, pW, + dilationT, dilationH, dilationW); } ); } diff --git a/aten/src/ATen/native/cuda/DistanceKernel.cu b/aten/src/ATen/native/cuda/DistanceKernel.cu index 385cac5c79e8..c43a2ae9877e 100644 --- a/aten/src/ATen/native/cuda/DistanceKernel.cu +++ b/aten/src/ATen/native/cuda/DistanceKernel.cu @@ -50,7 +50,9 @@ struct dists { // Special case backward when p is less than two struct lt_two { - static __forceinline__ __device__ scalar_t backward(const scalar_t diff, const scalar_t grad, const scalar_t dist, const scalar_t p) { return dist == 0.0 ? 0 : sign(diff) * std::pow(std::abs(diff), p - 1) * grad / std::pow(dist, p - 1); } + static __forceinline__ __device__ scalar_t backward(const scalar_t diff, const scalar_t grad, const scalar_t dist, const scalar_t p) { + return (dist == 0.0 || (diff == 0.0 && p < 1)) ? 
0 : (sign(diff) * std::pow(std::abs(diff), p - 1) * grad / std::pow(dist, p - 1)); + } }; // Two norm diff --git a/aten/src/ATen/native/cuda/FillKernel.cu b/aten/src/ATen/native/cuda/FillKernel.cu index 7376ecfa6394..e4fe4b68f2eb 100644 --- a/aten/src/ATen/native/cuda/FillKernel.cu +++ b/aten/src/ATen/native/cuda/FillKernel.cu @@ -6,12 +6,19 @@ namespace at { namespace native { +template +struct FillFunctor { + FillFunctor(scalar_t v): value(v) {} + __device__ __forceinline__ scalar_t operator() () const { + return value; + } + private: + scalar_t value; +}; + void fill_kernel_cuda(TensorIterator& iter, Scalar value) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::Bool, at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "fill_cuda", [&]() { - auto value_converted = value.to(); - gpu_kernel(iter, [value_converted]GPU_LAMBDA() -> scalar_t { - return value_converted; - }); + gpu_kernel(iter, FillFunctor(value.to())); }); } diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu index 239859b9138c..cc01bb030cf4 100644 --- a/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu +++ b/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu @@ -6,8 +6,9 @@ namespace at { namespace native { template class Op> std::vector foreach_tensor_list_op(TensorList tensors1, TensorList tensors2, Scalar alpha = 1) { - std::vector> tensor_lists; + std::vector> tensor_lists; std::vector vec_res; + vec_res.reserve(tensors1.size()); for (const auto& t: tensors1) { vec_res.emplace_back(at::native::empty_like(t)); } @@ -17,7 +18,11 @@ std::vector foreach_tensor_list_op(TensorList tensors1, TensorList tenso tensor_lists.emplace_back(std::move(vec_res)); AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, tensors1[0].scalar_type(), "foreach_binary_op_list_cuda", [&]() { - multi_tensor_apply<3>(tensor_lists, BinaryOpListAlphaFunctor(), alpha.to()); + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<3>(tensor_lists, + BinaryOpListAlphaFunctor(), + Op(), + alpha.to()); }); return tensor_lists[2]; @@ -25,12 +30,16 @@ std::vector foreach_tensor_list_op(TensorList tensors1, TensorList tenso template class Op> void foreach_tensor_list_op_(TensorList tensors1, TensorList tensors2, Scalar alpha = 1) { - std::vector> tensor_lists; + std::vector> tensor_lists; tensor_lists.emplace_back(tensors1.vec()); tensor_lists.emplace_back(tensors2.vec()); AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, tensors1[0].scalar_type(), "foreach_binary_op_list_cuda_", [&]() { - multi_tensor_apply<2>(tensor_lists, BinaryOpListAlphaFunctor_(), alpha.to()); + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<2>(tensor_lists, + BinaryOpListAlphaFunctor_(), + Op(), + alpha.to()); }); } diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu index 215410bbc2a5..71180785eb48 100644 --- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu +++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu @@ -8,8 +8,9 @@ template class Op> std::vector foreach_binary_op(TensorList tensors, Scalar scalar) { check_foreach_api_restrictions(tensors); - std::vector> tensor_lists; + std::vector> tensor_lists; std::vector vec_res; + vec_res.reserve(tensors.size()); for (const auto& t: tensors) { vec_res.emplace_back(at::native::empty_like(t)); } @@ -18,7 +19,11 @@ std::vector foreach_binary_op(TensorList tensors, Scalar scalar) { tensor_lists.emplace_back(std::move(vec_res)); 
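// Illustrative aside (not part of this patch): the dispatch below converts the
// scalar to an "op math" type before handing it to multi_tensor_apply, so that
// Half/BFloat16 inputs are combined in float and only the final result is cast
// back to the storage type (the trait is get_opmath_t in ForeachFunctors.cuh).
// A standalone sketch of that promotion pattern; HalfLike is a placeholder type
// invented here purely for illustration:
struct HalfLike {
  float value;  // stored as float only to keep the sketch short
  explicit HalfLike(float v) : value(v) {}
  explicit operator float() const { return value; }
};

template <typename T> struct opmath_of { using type = T; };
template <> struct opmath_of<HalfLike> { using type = float; };  // do the math in float

template <typename T, typename Op>
T binary_op_promoted(T a, T b, Op op) {
  using opmath_t = typename opmath_of<T>::type;
  return static_cast<T>(op(static_cast<opmath_t>(a), static_cast<opmath_t>(b)));
}

// e.g. binary_op_promoted(HalfLike{1.5f}, HalfLike{2.25f},
//                         [](float x, float y) { return x + y; });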
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, tensors[0].scalar_type(), "foreach_binary_op_scalar_cuda", [&]() { - multi_tensor_apply<2>(tensor_lists, BinaryOpScalarFunctor(), scalar.to()); + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<2>(tensor_lists, + BinaryOpScalarFunctor(), + Op(), + scalar.to()); }); return tensor_lists[1]; } @@ -27,11 +32,15 @@ template class Op> void foreach_binary_op_(TensorList tensors, Scalar scalar) { check_foreach_api_restrictions(tensors); - std::vector> tensor_lists; + std::vector> tensor_lists; tensor_lists.emplace_back(tensors.vec()); AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, tensors[0].scalar_type(), "foreach_binary_op_scalar_cuda_", [&]() { - multi_tensor_apply<1>(tensor_lists, BinaryOpScalarFunctor_(), scalar.to()); + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<1>(tensor_lists, + BinaryOpScalarFunctor_(), + Op(), + scalar.to()); }); } diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu new file mode 100644 index 000000000000..60f2bb737bf7 --- /dev/null +++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu @@ -0,0 +1,69 @@ +#include +#include +#include + +namespace at { namespace native { + +template class Op> +std::vector foreach_binary_op(TensorList tensors, at::ArrayRef scalars) { + std::vector> tensor_lists; + std::vector vec_res; + vec_res.reserve(tensors.size()); + for (const auto& t: tensors) { + vec_res.emplace_back(at::native::empty_like(t)); + } + + tensor_lists.emplace_back(tensors.vec()); + tensor_lists.emplace_back(vec_res); + + AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, tensors[0].scalar_type(), "foreach_binary_op_scalarlist_cuda", [&]() { + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<2>(tensor_lists, + scalars, + BinaryOpScalarListFunctor(), + Op()); + }); + return tensor_lists[1]; +} + +template class Op> +void foreach_binary_op_(TensorList tensors, at::ArrayRef scalars) { + std::vector> tensor_lists; + tensor_lists.emplace_back(tensors.vec()); + + AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, tensors[0].scalar_type(), "foreach_binary_op_scalarlist_cuda_", [&]() { + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<1>(tensor_lists, + scalars, + BinaryOpScalarListFunctor_(), + Op()); + }); +} + +#define FOREACH_BINARY_OP_SCALARLIST(NAME, OP) \ +void foreach_tensor_##NAME##_scalarlist_kernel_cuda_(TensorList tensors, at::ArrayRef scalars) { \ + check_foreach_api_restrictions(tensors); \ + \ + if (!can_use_fast_route(tensors, scalars)) { \ + return at::native::foreach_tensor_##NAME##_scalarlist_kernel_slow_(tensors, scalars); \ + } \ + \ + foreach_binary_op_(tensors, scalars); \ +} \ + \ +std::vector foreach_tensor_##NAME##_scalarlist_kernel_cuda(TensorList tensors, at::ArrayRef scalars) { \ + check_foreach_api_restrictions(tensors); \ + \ + if (!can_use_fast_route(tensors, scalars)) { \ + return at::native::foreach_tensor_##NAME##_scalarlist_kernel_slow(tensors, scalars); \ + } \ + \ + return foreach_binary_op(tensors, scalars); \ +} + +FOREACH_BINARY_OP_SCALARLIST(add, std::plus); +FOREACH_BINARY_OP_SCALARLIST(sub, std::minus); +FOREACH_BINARY_OP_SCALARLIST(mul, std::multiplies); +FOREACH_BINARY_OP_SCALARLIST(div, std::divides); + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/ForeachFunctors.cuh b/aten/src/ATen/native/cuda/ForeachFunctors.cuh index a04d27110c9a..dd01d584f045 100644 --- a/aten/src/ATen/native/cuda/ForeachFunctors.cuh 
+++ b/aten/src/ATen/native/cuda/ForeachFunctors.cuh @@ -5,12 +5,19 @@ namespace at { namespace native { namespace { -template class Op> +// For FP16 or BFloat16 inputs, ops should perform internal math in FP32. +template struct get_opmath_t { using opmath_t = scalar_t; }; +template<> struct get_opmath_t { using opmath_t = float; }; +template<> struct get_opmath_t { using opmath_t = float; }; + +template struct BinaryOpScalarFunctor_ { - __device__ void operator() ( + using opmath_t = typename get_opmath_t::opmath_t; + template __device__ __forceinline__ void operator() ( int chunk_size, TensorListMetadata<1>& tl, - T scalar) { + Op op, + opmath_t scalar) { int tensor_loc = tl.block_to_tensor[blockIdx.x]; int chunk_idx = tl.block_to_chunk[blockIdx.x]; int n = tl.sizes[tensor_loc]; @@ -29,7 +36,8 @@ struct BinaryOpScalarFunctor_ { load_store(r_x, x, 0 , i_start); #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii]), scalar); + r_x[ii] = static_cast(op(static_cast(r_x[ii]), + static_cast(scalar))); } // store load_store(x, r_x, i_start, 0); @@ -47,7 +55,8 @@ struct BinaryOpScalarFunctor_ { } #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii]), scalar); + r_x[ii] = static_cast(op(static_cast(r_x[ii]), + static_cast(scalar))); } #pragma unroll for(int ii = 0; ii < kILP; ii++) { @@ -60,12 +69,14 @@ struct BinaryOpScalarFunctor_ { } }; -template class Op> +template struct BinaryOpScalarFunctor { - __device__ void operator() ( + using opmath_t = typename get_opmath_t::opmath_t; + template __device__ __forceinline__ void operator() ( int chunk_size, TensorListMetadata<2>& tl, - T scalar) { + Op op, + opmath_t scalar) { int tensor_loc = tl.block_to_tensor[blockIdx.x]; int chunk_idx = tl.block_to_chunk[blockIdx.x]; int n = tl.sizes[tensor_loc]; @@ -87,7 +98,8 @@ struct BinaryOpScalarFunctor { load_store(r_x, x, 0 , i_start); #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii]), scalar); + r_x[ii] = static_cast(op(static_cast(r_x[ii]), + static_cast(scalar))); } // store load_store(out, r_x, i_start, 0); @@ -105,7 +117,8 @@ struct BinaryOpScalarFunctor { } #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii]), scalar); + r_x[ii] = static_cast(op(static_cast(r_x[ii]), + static_cast(scalar))); } #pragma unroll for(int ii = 0; ii < kILP; ii++) { @@ -118,12 +131,135 @@ struct BinaryOpScalarFunctor { } }; -template class Op> +template +struct BinaryOpScalarListFunctor_ { + using io_t = T; + using opmath_t = typename get_opmath_t::opmath_t; + template __device__ __forceinline__ void operator() ( + int chunk_size, + TensorListScalarListMetadata& tl, + Op op) { + int tensor_loc = tl.block_to_tensor[blockIdx.x]; + int chunk_idx = tl.block_to_chunk[blockIdx.x]; + int n = tl.sizes[tensor_loc]; + + T* x = (T*)tl.addresses[0][tensor_loc]; + x += chunk_idx * chunk_size; + + opmath_t y = tl.scalar_vals[tensor_loc]; + + n -= chunk_idx * chunk_size; + + T r_x[kILP]; + + // to make things simple, we put aligned case in a different code path + if(n % kILP == 0 && chunk_size % kILP == 0 && is_aligned(x)) { + for(int i_start = threadIdx.x; i_start * kILP < n && i_start * kILP < chunk_size; i_start += blockDim.x) { + // load + load_store(r_x, x, 0 , i_start); +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + r_x[ii] = static_cast(op(static_cast(r_x[ii]), y)); + } + // store + load_store(x, r_x, i_start, 0); + } + } + else { + for(int i_start = 0; i_start < n && i_start < 
chunk_size; i_start += blockDim.x * kILP) { +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + r_x[ii] = 0; + int i = i_start + threadIdx.x + ii * blockDim.x; + if(i < n && i < chunk_size) { + r_x[ii] = x[i]; + } + } +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + r_x[ii] = static_cast(op(static_cast(r_x[ii]), y)); + } +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + int i = i_start + threadIdx.x + ii * blockDim.x; + if(i < n && i < chunk_size) + x[i] = r_x[ii]; + } + } + } + } +}; + +template +struct BinaryOpScalarListFunctor { + using io_t = T; + using opmath_t = typename get_opmath_t::opmath_t; + template __device__ __forceinline__ void operator() ( + int chunk_size, + TensorListScalarListMetadata& tl, + Op op) { + int tensor_loc = tl.block_to_tensor[blockIdx.x]; + int chunk_idx = tl.block_to_chunk[blockIdx.x]; + int n = tl.sizes[tensor_loc]; + + T* x = (T*)tl.addresses[0][tensor_loc]; + x += chunk_idx * chunk_size; + + T* out = (T*)tl.addresses[1][tensor_loc]; + out += chunk_idx * chunk_size; + + opmath_t y = tl.scalar_vals[tensor_loc]; + + n -= chunk_idx * chunk_size; + + T r_x[kILP]; + + // to make things simple, we put aligned case in a different code path + if(n % kILP == 0 && chunk_size % kILP == 0 && is_aligned(x) && is_aligned(out)) { + for(int i_start = threadIdx.x; i_start * kILP < n && i_start * kILP < chunk_size; i_start += blockDim.x) { + // load + load_store(r_x, x, 0 , i_start); +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + r_x[ii] = static_cast(op(static_cast(r_x[ii]), y)); + } + // store + load_store(out, r_x, i_start, 0); + } + } + else { + for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * kILP) { +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + r_x[ii] = 0; + int i = i_start + threadIdx.x + ii * blockDim.x; + if(i < n && i < chunk_size) { + r_x[ii] = x[i]; + } + } +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + r_x[ii] = static_cast(op(static_cast(r_x[ii]), y)); + } +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + int i = i_start + threadIdx.x + ii * blockDim.x; + if(i < n && i < chunk_size) + out[i] = r_x[ii]; + } + } + } + } +}; + +template struct BinaryOpListAlphaFunctor_ { - __device__ void operator() ( + using opmath_t = typename get_opmath_t::opmath_t; + template __device__ __forceinline__ void operator() ( int chunk_size, - TensorListMetadata<2>& tl, - T alpha) { + TensorListMetadata<2>& tl, + Op op, + opmath_t alpha) { int tensor_loc = tl.block_to_tensor[blockIdx.x]; int chunk_idx = tl.block_to_chunk[blockIdx.x]; int n = tl.sizes[tensor_loc]; @@ -147,7 +283,8 @@ struct BinaryOpListAlphaFunctor_ { load_store(r_y, y, 0 , i_start); #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii]), alpha * static_cast(r_y[ii])); + r_x[ii] = static_cast(op(static_cast(r_x[ii]), + alpha * static_cast(r_y[ii]))); } // store load_store(x, r_x, i_start , 0); @@ -167,7 +304,8 @@ struct BinaryOpListAlphaFunctor_ { } #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii]), alpha * static_cast(r_y[ii])); + r_x[ii] = static_cast(op(static_cast(r_x[ii]), + alpha * static_cast(r_y[ii]))); } #pragma unroll for(int ii = 0; ii < kILP; ii++) { @@ -180,12 +318,14 @@ struct BinaryOpListAlphaFunctor_ { } }; -template class Op> +template struct BinaryOpListAlphaFunctor { - __device__ void operator() ( + using opmath_t = typename get_opmath_t::opmath_t; + template __device__ __forceinline__ void operator() ( int chunk_size, TensorListMetadata<3>& tl, - T 
alpha) { + Op op, + opmath_t alpha) { int tensor_loc = tl.block_to_tensor[blockIdx.x]; int chunk_idx = tl.block_to_chunk[blockIdx.x]; int n = tl.sizes[tensor_loc]; @@ -212,7 +352,8 @@ struct BinaryOpListAlphaFunctor { load_store(r_y, y, 0 , i_start); #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii]), alpha * static_cast(r_y[ii])); + r_x[ii] = static_cast(op(static_cast(r_x[ii]), + alpha * static_cast(r_y[ii]))); } // store load_store(out, r_x, i_start , 0); @@ -232,7 +373,8 @@ struct BinaryOpListAlphaFunctor { } #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii]), alpha * static_cast(r_y[ii])); + r_x[ii] = static_cast(op(static_cast(r_x[ii]), + alpha * static_cast(r_y[ii]))); } #pragma unroll for(int ii = 0; ii < kILP; ii++) { @@ -245,11 +387,13 @@ struct BinaryOpListAlphaFunctor { } }; -template class Op> +template struct UnaryOpFunctor_ { - __device__ void operator() ( + using opmath_t = typename get_opmath_t::opmath_t; + template __device__ __forceinline__ void operator() ( int chunk_size, - TensorListMetadata<1>& tl) { + TensorListMetadata<1>& tl, + Op op) { int tensor_loc = tl.block_to_tensor[blockIdx.x]; int chunk_idx = tl.block_to_chunk[blockIdx.x]; int n = tl.sizes[tensor_loc]; @@ -268,7 +412,7 @@ struct UnaryOpFunctor_ { load_store(r_x, x, 0 , i_start); #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii])); + r_x[ii] = static_cast(op(static_cast(r_x[ii]))); } // store load_store(x, r_x, i_start, 0); @@ -286,7 +430,7 @@ struct UnaryOpFunctor_ { } #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii])); + r_x[ii] = static_cast(op(static_cast(r_x[ii]))); } #pragma unroll for(int ii = 0; ii < kILP; ii++) { @@ -299,11 +443,13 @@ struct UnaryOpFunctor_ { } }; -template class Op> +template struct UnaryOpFunctor { - __device__ void operator() ( + using opmath_t = typename get_opmath_t::opmath_t; + template __device__ __forceinline__ void operator() ( int chunk_size, - TensorListMetadata<2>& tl) { + TensorListMetadata<2>& tl, + Op op) { int tensor_loc = tl.block_to_tensor[blockIdx.x]; int chunk_idx = tl.block_to_chunk[blockIdx.x]; int n = tl.sizes[tensor_loc]; @@ -325,7 +471,7 @@ struct UnaryOpFunctor { load_store(r_x, x, 0 , i_start); #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii])); + r_x[ii] = static_cast(op(static_cast(r_x[ii]))); } // store load_store(out, r_x, i_start, 0); @@ -343,7 +489,7 @@ struct UnaryOpFunctor { } #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii])); + r_x[ii] = static_cast(op(static_cast(r_x[ii]))); } #pragma unroll for(int ii = 0; ii < kILP; ii++) { @@ -356,12 +502,14 @@ struct UnaryOpFunctor { } }; -template class Op> +template struct PointwiseOpFunctor_ { - __device__ void operator() ( + using opmath_t = typename get_opmath_t::opmath_t; + template __device__ __forceinline__ void operator() ( int chunk_size, TensorListMetadata<3>& tl, - T scalar) { + Op op, + opmath_t scalar) { int tensor_loc = tl.block_to_tensor[blockIdx.x]; int chunk_idx = tl.block_to_chunk[blockIdx.x]; int n = tl.sizes[tensor_loc]; @@ -390,7 +538,9 @@ struct PointwiseOpFunctor_ { load_store(r_z, z, 0 , i_start); #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = static_cast(r_x[ii]) + scalar * Op()(static_cast(r_y[ii]), static_cast(r_z[ii])); + r_x[ii] = static_cast(static_cast(r_x[ii]) + + scalar * op(static_cast(r_y[ii]), + static_cast(r_z[ii]))); } // store 
load_store(x, r_x, i_start, 0); @@ -412,7 +562,9 @@ struct PointwiseOpFunctor_ { } #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = static_cast(r_x[ii]) + scalar * Op()(static_cast(r_y[ii]), static_cast(r_z[ii])); + r_x[ii] = static_cast(static_cast(r_x[ii]) + + scalar * op(static_cast(r_y[ii]), + static_cast(r_z[ii]))); } #pragma unroll for(int ii = 0; ii < kILP; ii++) { @@ -425,12 +577,14 @@ struct PointwiseOpFunctor_ { } }; -template class Op> +template struct PointwiseOpFunctor { - __device__ void operator() ( + using opmath_t = typename get_opmath_t::opmath_t; + template __device__ __forceinline__ void operator() ( int chunk_size, TensorListMetadata<4>& tl, - T scalar) { + Op op, + opmath_t scalar) { int tensor_loc = tl.block_to_tensor[blockIdx.x]; int chunk_idx = tl.block_to_chunk[blockIdx.x]; int n = tl.sizes[tensor_loc]; @@ -462,7 +616,9 @@ struct PointwiseOpFunctor { load_store(r_z, z, 0 , i_start); #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = static_cast(r_x[ii]) + scalar * Op()(static_cast(r_y[ii]), static_cast(r_z[ii])); + r_x[ii] = static_cast(static_cast(r_x[ii]) + + scalar * op(static_cast(r_y[ii]), + static_cast(r_z[ii]))); } // store load_store(out, r_x, i_start, 0); @@ -485,7 +641,9 @@ struct PointwiseOpFunctor { } #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = static_cast(r_x[ii]) + scalar * Op()(static_cast(r_y[ii]), static_cast(r_z[ii])); + r_x[ii] = static_cast(static_cast(r_x[ii]) + + scalar * op(static_cast(r_y[ii]), + static_cast(r_z[ii]))); } #pragma unroll for(int ii = 0; ii < kILP; ii++) { diff --git a/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu b/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu index b514f3294c52..7ce2fc566110 100644 --- a/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu +++ b/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu @@ -6,8 +6,9 @@ namespace at { namespace native { template class Op> std::vector foreach_pointwise_op(TensorList input, TensorList tensors1, TensorList tensors2, Scalar scalar) { - std::vector> tensor_lists; + std::vector> tensor_lists; std::vector vec_res; + vec_res.reserve(input.size()); for (const auto& t: input) { vec_res.emplace_back(at::native::empty_like(t)); } @@ -18,7 +19,11 @@ std::vector foreach_pointwise_op(TensorList input, TensorList tensors1, tensor_lists.emplace_back(std::move(vec_res)); AT_DISPATCH_ALL_TYPES_AND(kHalf, input[0].scalar_type(), "foreach_pointwise_op_cuda", [&]() { - multi_tensor_apply<4>(tensor_lists, PointwiseOpFunctor(), scalar.to()); + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<4>(tensor_lists, + PointwiseOpFunctor(), + Op(), + scalar.to()); }); return tensor_lists[3]; @@ -26,13 +31,17 @@ std::vector foreach_pointwise_op(TensorList input, TensorList tensors1, template class Op> void foreach_pointwise_op_(TensorList input, TensorList tensors1, TensorList tensors2, Scalar scalar) { - std::vector> tensor_lists; + std::vector> tensor_lists; tensor_lists.emplace_back(input.vec()); tensor_lists.emplace_back(tensors1.vec()); tensor_lists.emplace_back(tensors2.vec()); AT_DISPATCH_ALL_TYPES_AND(kHalf, input[0].scalar_type(), "foreach_pointwise_op__cuda", [&]() { - multi_tensor_apply<3>(tensor_lists, PointwiseOpFunctor_(), scalar.to()); + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<3>(tensor_lists, + PointwiseOpFunctor_(), + Op(), + scalar.to()); }); } @@ -59,7 +68,7 @@ void foreach_tensor_##NAME##_cuda_(TensorList input, TensorList tensors1, Tensor if (!can_use_fast_route(input, scalar) || \ 
!can_use_fast_route(tensors1, tensors2) || \ !can_use_fast_route(input, tensors1)) { \ - at::native::foreach_tensor_##NAME##_slow_(input, tensors1, tensors2, scalar); \ + return at::native::foreach_tensor_##NAME##_slow_(input, tensors1, tensors2, scalar); \ } \ \ foreach_pointwise_op_(input, tensors1, tensors2, scalar); \ diff --git a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu index 32bb6ab6b509..1160d64bba6d 100644 --- a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu +++ b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu @@ -6,8 +6,9 @@ namespace at { namespace native { template class Op> std::vector foreach_unary_op(TensorList tensors) { - std::vector> tensor_lists; + std::vector> tensor_lists; std::vector vec_res; + vec_res.reserve(tensors.size()); for (const auto& t: tensors) { vec_res.emplace_back(at::native::empty_like(t)); } @@ -16,18 +17,24 @@ std::vector foreach_unary_op(TensorList tensors) { tensor_lists.emplace_back(std::move(vec_res)); AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, tensors[0].scalar_type(), "foreach_unary_op_cuda", [&]() { - multi_tensor_apply<2>(tensor_lists, UnaryOpFunctor()); + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<2>(tensor_lists, + UnaryOpFunctor(), + Op()); }); return tensor_lists[1]; } template class Op> void foreach_unary_op_(TensorList tensors) { - std::vector> tensor_lists; + std::vector> tensor_lists; tensor_lists.emplace_back(tensors.vec()); AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, tensors[0].scalar_type(), "foreach_unary_op_cuda_", [&]() { - multi_tensor_apply<1>(tensor_lists, UnaryOpFunctor_()); + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<1>(tensor_lists, + UnaryOpFunctor_(), + Op()); }); } diff --git a/aten/src/ATen/native/cuda/IndexKernel.cu b/aten/src/ATen/native/cuda/IndexKernel.cu index b69267e90437..9c3eab4497aa 100644 --- a/aten/src/ATen/native/cuda/IndexKernel.cu +++ b/aten/src/ATen/native/cuda/IndexKernel.cu @@ -4,9 +4,12 @@ #include #include #include -#include +#include #include +#include +#include #include +#include namespace at { namespace native { @@ -14,6 +17,54 @@ static constexpr int launch_bound2 = 4; static constexpr int launch_size_nd = 128; +template +__device__ __forceinline__ IndexType indexToOffset( + const cuda::detail::TensorInfo& info, + int64_t index, + IndexType size) { + IndexType linearIndex = static_cast(index); + CUDA_KERNEL_ASSERT(linearIndex < size && linearIndex >= -size); + if (linearIndex < 0) { + linearIndex += size; + } + return cuda::detail::IndexToOffset::get(linearIndex, info); +} + +template +void dispatchTakePutImpl(const Tensor& input, Tensor& output, const Tensor& index) { + auto inputInfo = cuda::detail::getTensorInfo(input); + inputInfo.collapseDims(); + auto numel = input.numel(); + if (inputInfo.isContiguous()) { + cuda::CUDA_tensor_apply2( + output, + index, + [inputInfo, numel] __device__ ( + T & out, const int64_t& idx) { + auto offset = indexToOffset<-2, T, IndexType>(inputInfo, idx, numel); + out = inputInfo.data[offset]; + }); + } else { + cuda::CUDA_tensor_apply2( + output, + index, + [inputInfo, numel] __device__ ( + T & out, const int64_t& idx) { + auto offset = indexToOffset<-1, T, IndexType>(inputInfo, idx, numel); + out = inputInfo.data[offset]; + }); + } +} + +template +void dispatchTakePut(const Tensor& input, Tensor& output, const Tensor& index) { + if (cuda::detail::canUse32BitIndexMath(input)) { + dispatchTakePutImpl(input, output, index); + } else { + 
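// take() reads the input as if it were flattened and gathers input.flatten()[index], with
// Python-style negative indexing. The wrapping performed by indexToOffset above reduces to
// this host-side rule (an illustration, not ATen code):
#include <cassert>
#include <cstdint>

int64_t wrap_take_index(int64_t idx, int64_t numel) {
  assert(idx < numel && idx >= -numel);  // same bound the kernel asserts via CUDA_KERNEL_ASSERT
  return idx < 0 ? idx + numel : idx;    // e.g. wrap_take_index(-1, 5) == 4
}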
dispatchTakePutImpl(input, output, index); + } +} + template C10_LAUNCH_BOUNDS_2(nt, launch_bound2) __global__ void index_elementwise_kernel(int N, func_t f) { @@ -154,6 +205,48 @@ Tensor & masked_select_out_cuda(Tensor & result, const Tensor & self, const Tens return masked_select_out_cuda_impl(result, self, mask); } +void take_out_cuda_template(Tensor& output, const Tensor& input, const Tensor& index) { + TORCH_CHECK(output.device().type() == at::kCUDA, "device type of output (", output.device().type(), ") is not GPU"); + TORCH_CHECK(input.device().type() == at::kCUDA, "device type of input (", input.device().type(), ") is not GPU"); + TORCH_CHECK(index.device().type() == at::kCUDA, "device type of index (", index.device().type(), ") is not GPU"); + + TORCH_CHECK(output.layout() == Layout::Strided, "take() only supports strided layout, got layout: ", output.layout(), " on output tensor"); + TORCH_CHECK(input.layout() == Layout::Strided, "take() only supports strided layout, got layout: ", input.layout(), " on input tensor"); + TORCH_CHECK(index.layout() == Layout::Strided, "take() only supports strided layout, got layout: ", index.layout(), " on index tensor"); + + TORCH_CHECK(output.scalar_type() == input.scalar_type(), + "output and input scalar type must match. but got different types: ", output.scalar_type(), " and ", input.scalar_type()); + TORCH_CHECK(index.scalar_type() == kLong, "index must be an int64 tensor"); + + TensorArg output_arg{ output, "output", 1 }; + TensorArg input_arg{ input, "input", 2 }; + TensorArg index_arg{ index, "index", 3 }; + checkAllSameGPU("take", {output_arg, input_arg, index_arg}); + + TORCH_CHECK(input.dim() < MAX_CUTORCH_DIMS, CUTORCH_DIM_WARNING); + TORCH_CHECK(output.dim() < MAX_CUTORCH_DIMS, CUTORCH_DIM_WARNING); + TORCH_CHECK(index.dim() < MAX_CUTORCH_DIMS, CUTORCH_DIM_WARNING); + + TORCH_CHECK(!(input.numel() == 0 && index.numel() != 0), "tried to take from an empty tensor"); + + output.resize_(index.sizes()); + + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Bool, at::ScalarType::Half, input.scalar_type(), "take_cuda", [&] { + dispatchTakePut(input, output, index); + }); +} + +Tensor take_cuda(const Tensor& self, const Tensor& index) { + auto out = at::empty(index.sizes(), self.options()); + take_out_cuda_template(out, self, index); + return out; +} + +Tensor& take_out_cuda(Tensor& out, const Tensor& self, const Tensor& index) { + take_out_cuda_template(out, self, index); + return out; +} + REGISTER_DISPATCH(index_stub, &index_kernel); REGISTER_DISPATCH(index_put_stub, &index_put_kernel); diff --git a/aten/src/ATen/native/cuda/LinearAlgebra.cu b/aten/src/ATen/native/cuda/LinearAlgebra.cu index c78029d6a7e0..76f5c0a99efe 100644 --- a/aten/src/ATen/native/cuda/LinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/LinearAlgebra.cu @@ -178,120 +178,6 @@ Tensor& addmm__cuda(Tensor& self, const Tensor& mat1, const Tensor& mat2, return self; } -template -void addr_impl_ger_cuda(Tensor &out, const Tensor &self, - const Tensor& vec1, const Tensor& vec2, - scalar_t alpha, scalar_t beta) { - static_assert(std::is_same::value || - std::is_same::value, - "addr_impl_ger_cuda: only float and double are supported"); - if (&out != &self) { - at::native::resize_as_(out, self); - at::native::copy_(out, self); - } - if (beta == 0.0) { - at::native::zero_(out); - } - if (beta != 1.0) { - at::native::mul_(out, beta); - } - if (out.stride(0) == 1) { - at::cuda::blas::ger( - vec1.size(0), vec2.size(0), alpha, - vec1.data_ptr(), vec1.stride(0), - vec2.data_ptr(), 
vec2.stride(0), - out.data_ptr(), out.stride(1) - ); - } else if (out.stride(1) == 1) { - at::cuda::blas::ger( - vec2.size(0), vec1.size(0), alpha, - vec2.data_ptr(), vec2.stride(0), - vec1.data_ptr(), vec1.stride(0), - out.data_ptr(), out.stride(0) - ); - } else { - Tensor cr = out.clone(); - at::cuda::blas::ger( - vec2.size(0), vec1.size(0), alpha, - vec2.data_ptr(), vec2.stride(0), - vec1.data_ptr(), vec1.stride(0), - out.data_ptr(), out.stride(0) - ); - out.set_(cr); - } -} - -template -void addr_impl_cuda(Tensor &out, const Tensor &self, - const Tensor& vec1, const Tensor& vec2, - scalar_t alpha, scalar_t beta) { - // currently no Hger/SgerEx in Cublas. - Tensor vec2T = vec2.reshape({1, vec2.size(0)}); - Tensor vec1M = vec1.reshape({vec1.size(0), 1}); - addmm_out_cuda(out, self, vec1M, vec2T, beta, alpha); -} -template<> -void addr_impl_cuda(Tensor &out, const Tensor &self, - const Tensor& vec1, const Tensor& vec2, - float alpha, float beta) { - addr_impl_ger_cuda(out, self, vec1, vec2, alpha, beta); -} -template<> -void addr_impl_cuda(Tensor &out, const Tensor &self, - const Tensor& vec1, const Tensor& vec2, - double alpha, double beta) { - addr_impl_ger_cuda(out, self, vec1, vec2, alpha, beta); -} - -Tensor& addr_out_cuda(Tensor &out, const Tensor& self, - const Tensor& vec1, const Tensor& vec2, - Scalar beta, Scalar alpha) { - TORCH_CHECK(vec1.dim() == 1 && vec2.dim() == 1, - "vec1 and vec2 should be 1-dimensional vectors. Got dimensions ", - vec1.dim(), " and ", vec2.dim()); - - Tensor self_; - if (&out != &self) { - std::tie(self_) = expand_size(self, {vec1.size(0), vec2.size(0)}, "addr"); - } else { - self_ = self; - } - - TORCH_CHECK(out.device() == self_.device() && - out.device() == vec1.device() && - out.device() == vec2.device(), - "Expected all tensors to be on the same device. 
Found: ", - out.device(), ", ", self_.device(), ", ", - vec1.device(), " and ", vec2.device()); - TORCH_CHECK(self_.dim() == 2, - "2D tensor expected, got ", self_.dim(), "D tensor for input"); - TORCH_CHECK(self_.size(0) == vec1.size(0) && self_.size(1) == vec2.size(0), - "size mismatch", - ", input: ", self_.sizes(), - ", v1: ", vec1.sizes(), - ", v2: ", vec2.sizes()); - AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, self_.scalar_type(), "addr_out_cuda", [&] { - addr_impl_cuda(out, self_, vec1, vec2, - alpha.to(), beta.to()); - }); - return out; -} - -Tensor& addr__cuda(Tensor& self, - const Tensor& vec1, const Tensor& vec2, - Scalar beta, Scalar alpha) { - addr_out_cuda(self, self, vec1, vec2, beta, alpha); - return self; -} - -Tensor addr_cuda(const Tensor& self, - const Tensor& vec1, const Tensor& vec2, - Scalar beta, Scalar alpha) { - Tensor out = at::empty({0}, self.options()); - addr_out_cuda(out, self, vec1, vec2, beta, alpha); - return out; -} - Tensor& addbmm_out_cuda(Tensor& out, const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { diff --git a/aten/src/ATen/native/cuda/Loops.cuh b/aten/src/ATen/native/cuda/Loops.cuh index bb913dc0ec9e..412f6b70c2c5 100644 --- a/aten/src/ATen/native/cuda/Loops.cuh +++ b/aten/src/ATen/native/cuda/Loops.cuh @@ -140,7 +140,6 @@ struct BUnaryFunctor { template void gpu_kernel_with_scalars(TensorIterator& iter, const func_t& f) { - ASSERT_HOST_DEVICE_LAMBDA(func_t); TORCH_INTERNAL_ASSERT(iter.ntensors() == 3); using traits = function_traits; diff --git a/aten/src/ATen/native/cuda/MultiTensorApply.cuh b/aten/src/ATen/native/cuda/MultiTensorApply.cuh index f82a0d9a58c8..f0f8f97fabb1 100644 --- a/aten/src/ATen/native/cuda/MultiTensorApply.cuh +++ b/aten/src/ATen/native/cuda/MultiTensorApply.cuh @@ -26,6 +26,7 @@ __device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int s // TensorListMetadata has to be < 4KB - the limit for kernel launch argument static constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30}; static constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320}; +static constexpr int depth_to_max_tensors_scalarlist[5] = {96, 64, 48, 36, 30}; template struct TensorListMetadata { @@ -35,25 +36,95 @@ template struct TensorListMetadata int block_to_chunk[depth_to_max_blocks[n-1]]; }; +template struct TensorListScalarListMetadata +{ + void* addresses[n][depth_to_max_tensors_scalarlist[n-1]]; + int sizes[depth_to_max_tensors_scalarlist[n-1]]; + scalar_vals_t scalar_vals[depth_to_max_tensors_scalarlist[n-1]]; + unsigned char block_to_tensor[depth_to_max_blocks[n-1]]; + int block_to_chunk[depth_to_max_blocks[n-1]]; +}; + template C10_LAUNCH_BOUNDS_1(kBlockSize) -__global__ void +__global__ void multi_tensor_apply_kernel( T tensorListMeta, U callable, ArgTypes... args) { // Hand the chunk information to the user-supplied functor to process however it likes. - callable(kChunkSize, tensorListMeta, args...); + callable(kChunkSize, tensorListMeta, args...); } template void multi_tensor_apply( std::vector>& tensor_lists, + at::ArrayRef scalars, T callable, ArgTypes... 
args) { TORCH_CHECK(tensor_lists.size() == depth, "Number of tensor lists has to match the depth."); const cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0])); + size_t n_tensors = tensor_lists[0].size(); + using scalar_vals_t = typename T::opmath_t; + TensorListScalarListMetadata tensorListMeta; + + int loc_block_info = 0; + int loc_tensor_info = 0; + for(size_t t = 0; t < n_tensors; t++) { + + tensorListMeta.scalar_vals[loc_tensor_info] = scalars[t]; + + tensorListMeta.sizes[loc_tensor_info] = tensor_lists[0][t].numel(); + for (int d = 0; d < depth; d++) { + tensorListMeta.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr(); + } + loc_tensor_info++; + + int chunks = (tensor_lists[0][t].numel() + kChunkSize - 1)/kChunkSize; + for (int chunk = 0; chunk < chunks; chunk++) { + tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1; + tensorListMeta.block_to_chunk[loc_block_info] = chunk; + loc_block_info++; + bool tensors_full = (loc_tensor_info == depth_to_max_tensors_scalarlist[depth-1] && + chunk == chunks - 1); + bool blocks_full = (loc_block_info == depth_to_max_blocks[depth-1]); + bool last_chunk = (t == n_tensors - 1 && chunk == chunks - 1); + + if (tensors_full || blocks_full || last_chunk) { + multi_tensor_apply_kernel<<>>( + tensorListMeta, + callable, + args...); + + AT_CUDA_CHECK(cudaGetLastError()); + + // Reset. + loc_block_info = 0; + if(chunk == chunks - 1) { + loc_tensor_info = 0; + } + else { + tensorListMeta.sizes[0] = tensorListMeta.sizes[loc_tensor_info-1]; + tensorListMeta.scalar_vals[0] = tensorListMeta.scalar_vals[loc_tensor_info-1]; + for(int d = 0; d < depth; d++) { + tensorListMeta.addresses[d][0] = tensorListMeta.addresses[d][loc_tensor_info-1]; + } + loc_tensor_info = 1; + } + } + } + } + } + + +template +void multi_tensor_apply( + std::vector>& tensor_lists, + T callable, + ArgTypes... args) { + TORCH_CHECK(tensor_lists.size() == depth, "Number of tensor lists has to match the depth."); + const cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0])); size_t n_tensors = tensor_lists[0].size(); TensorListMetadata tensorListMeta; @@ -88,7 +159,7 @@ void multi_tensor_apply( // Reset. 
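// The packing loop above queues (tensor, chunk) pairs into a single kernel-argument struct and
// launches whenever a capacity limit is hit; after a launch, a partially processed tensor is
// carried over into slot 0. A compact restatement of the flush test, with illustrative limits
// (the exact kChunkSize and per-depth limits are the constants defined in this header):
bool should_launch(int queued_tensors, int queued_blocks,
                   bool last_chunk_of_tensor, bool last_chunk_overall,
                   int max_tensors /* e.g. 96 for depth 1 with a scalar list */,
                   int max_blocks  /* e.g. 320 */) {
  bool tensors_full = (queued_tensors == max_tensors) && last_chunk_of_tensor;
  bool blocks_full  = (queued_blocks == max_blocks);
  return tensors_full || blocks_full || last_chunk_overall;
}
// e.g. assuming kChunkSize = 65536, a 1,000,000-element tensor contributes
// (1000000 + 65536 - 1) / 65536 = 16 chunks, i.e. 16 queued blocks.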
loc_block_info = 0; if(chunk == chunks - 1) { - loc_tensor_info = 0; + loc_tensor_info = 0; } else { tensorListMeta.sizes[0] = tensorListMeta.sizes[loc_tensor_info-1]; diff --git a/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu b/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu index 10138f4bced0..13149759926d 100644 --- a/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu +++ b/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu @@ -684,10 +684,7 @@ void slow_conv_transpose2d_acc_grad_parameters_cuda_template( // Matrix mulitply per output: input_n = input.select(0, elt); - if (kernel_height == 1 && kernel_width == 1) { - // for 1x1 column skip im2col step - columns.copy_(grad_output_n); - } else { + if (kernel_height != 1 || kernel_width != 1) { // Extract columns: im2col( at::cuda::getCurrentCUDAStream(), diff --git a/aten/src/ATen/native/cuda/PointwiseOpsKernel.cu b/aten/src/ATen/native/cuda/PointwiseOpsKernel.cu index 501ef90477da..33162b3d5271 100644 --- a/aten/src/ATen/native/cuda/PointwiseOpsKernel.cu +++ b/aten/src/ATen/native/cuda/PointwiseOpsKernel.cu @@ -26,17 +26,18 @@ void addcdiv_cuda_kernel(TensorIterator& iter, Scalar value) { }); } -void smooth_l1_backward_cuda_kernel(TensorIterator& iter, Scalar norm) { - AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.dtype(), "smooth_l1_backward_cuda", [&]() { +void smooth_l1_backward_cuda_kernel(TensorIterator& iter, Scalar norm, double beta) { + AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.dtype(), "smooth_l1_backward_cuda", [&iter, &norm, beta] { auto norm_val = norm.to(); - gpu_kernel(iter, [norm_val]GPU_LAMBDA(scalar_t input, scalar_t target, scalar_t grad_output) -> scalar_t { + scalar_t beta_val(beta); + gpu_kernel(iter, [norm_val, beta_val]GPU_LAMBDA(scalar_t input, scalar_t target, scalar_t grad_output) -> scalar_t { const auto x = input - target; - if (x < scalar_t(-1)) + if (x < -beta_val) return -norm_val * grad_output; - else if (x > scalar_t(1)) + else if (x > beta_val) return norm_val * grad_output; else - return norm_val * x * grad_output; + return norm_val * x * grad_output / beta_val; }); }); } diff --git a/aten/src/ATen/native/cuda/ReduceMinMaxKernel.cu b/aten/src/ATen/native/cuda/ReduceMinMaxKernel.cu index 83d11ed9f9e1..cb070e15f191 100644 --- a/aten/src/ATen/native/cuda/ReduceMinMaxKernel.cu +++ b/aten/src/ATen/native/cuda/ReduceMinMaxKernel.cu @@ -13,20 +13,32 @@ namespace at { namespace native { +template +struct MaxNanFunctor { + __device__ __forceinline__ acc_t operator()(acc_t a, acc_t b) const { + return (THCNumerics::isnan(a) || a > b) ? a : b; + } +}; + template void max_values_kernel_cuda_impl(TensorIterator& iter) { gpu_reduce_kernel( - iter, func_wrapper ([]GPU_LAMBDA(acc_t a, acc_t b) -> acc_t { - return (THCNumerics::isnan(a) || a > b) ? a : b; - }), at::numeric_limits::lower_bound()); + iter, func_wrapper (MaxNanFunctor()), + at::numeric_limits::lower_bound()); } +template +struct MinNanFunctor { + __device__ __forceinline__ acc_t operator()(acc_t a, acc_t b) const { + return (THCNumerics::isnan(a) || a < b) ? a : b; + } +}; + template void min_values_kernel_cuda_impl(TensorIterator& iter) { gpu_reduce_kernel( - iter, func_wrapper ([]GPU_LAMBDA(acc_t a, acc_t b) -> acc_t { - return (THCNumerics::isnan(a) || a < b) ? 
a : b; - }), at::numeric_limits::upper_bound()); + iter, func_wrapper (MinNanFunctor()), + at::numeric_limits::upper_bound()); } void max_values_kernel_cuda(TensorIterator& iter) { diff --git a/aten/src/ATen/native/cuda/UnaryFractionKernels.cu b/aten/src/ATen/native/cuda/UnaryFractionKernels.cu index eb9250befd56..6046bc9a1f01 100644 --- a/aten/src/ATen/native/cuda/UnaryFractionKernels.cu +++ b/aten/src/ATen/native/cuda/UnaryFractionKernels.cu @@ -114,7 +114,7 @@ __host__ __device__ static inline c10::complex nearbyint_wrapper(c10::co } void round_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, iter.dtype(), "round_cuda", [&]() { + AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::Half, iter.dtype(), "round_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { // We do not use std::round because we would like to round midway numbers to the nearest even integer. return nearbyint_wrapper(a); diff --git a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu index 1067d7c61bc5..5b545471fb34 100644 --- a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -180,6 +181,32 @@ void clamp_max_kernel_cuda(TensorIterator& iter, Scalar max_value) { }); } +void nan_to_num_kernel_cuda( + TensorIterator& iter, + c10::optional nan, + c10::optional pos_inf, + c10::optional neg_inf) { + AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "nan_to_num_cuda", [&]() { + scalar_t nan_replacement = static_cast(nan.value_or(0.)); + scalar_t pos_inf_replacement = pos_inf.has_value() + ? static_cast(pos_inf.value()) + : std::numeric_limits::max(); + scalar_t neg_inf_replacement = neg_inf.has_value() + ? static_cast(neg_inf.value()) + : std::numeric_limits::lowest(); + gpu_kernel(iter, [=] GPU_LAMBDA(scalar_t a) -> scalar_t { + return ( + at::_isnan(a) + ? nan_replacement + : (a == std::numeric_limits::infinity() + ? pos_inf_replacement + : (a == -std::numeric_limits::infinity() + ? neg_inf_replacement + : a))); + }); + }); +} + void kaiser_window_kernel_cuda(TensorIterator& iter, int64_t window_length, double beta){ AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "kaiser_window_cuda", [&](){ AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "kaiser_window_cuda", [&] { @@ -206,6 +233,7 @@ REGISTER_DISPATCH(erfinv_stub, &erfinv_kernel_cuda); REGISTER_DISPATCH(clamp_stub, &clamp_kernel_cuda); REGISTER_DISPATCH(clamp_min_stub, &clamp_min_kernel_cuda); REGISTER_DISPATCH(clamp_max_stub, &clamp_max_kernel_cuda); +REGISTER_DISPATCH(nan_to_num_stub, &nan_to_num_kernel_cuda); REGISTER_DISPATCH(kaiser_window_stub, &kaiser_window_kernel_cuda); } // namespace native diff --git a/aten/src/ATen/native/cuda/UnarySignKernels.cu b/aten/src/ATen/native/cuda/UnarySignKernels.cu index 3d90089556be..aae3906575f9 100644 --- a/aten/src/ATen/native/cuda/UnarySignKernels.cu +++ b/aten/src/ATen/native/cuda/UnarySignKernels.cu @@ -11,21 +11,19 @@ namespace at { namespace native { void logical_not_kernel_cuda(TensorIterator& iter) { - // error check -- this is just ensuring we don't dispatch on types that aren't in ALL_TYPES_AND2(...) + // error check -- this is just ensuring we don't dispatch on types that aren't in ALL_TYPES_AND_COMPLEX_AND3(...) // so we don't have to maintain a separate list or to do double dispatch. 
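// nan_to_num_kernel_cuda above maps every element through a fixed replacement rule:
// NaN -> nan, +inf -> pos_inf (default: the dtype's max), -inf -> neg_inf (default: the
// dtype's lowest). The per-element rule, as a host-side float sketch:
#include <cmath>
#include <limits>

float nan_to_num_scalar(float a, float nan_repl, float posinf_repl, float neginf_repl) {
  if (std::isnan(a))                                 return nan_repl;
  if (a ==  std::numeric_limits<float>::infinity())  return posinf_repl;
  if (a == -std::numeric_limits<float>::infinity())  return neginf_repl;
  return a;
}
// defaults used by the kernel: nan_repl = 0, posinf_repl = numeric_limits<scalar_t>::max(),
// neginf_repl = numeric_limits<scalar_t>::lowest()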
- AT_DISPATCH_ALL_TYPES_AND2(kBool, kHalf, iter.dtype(0), "logical_not_cuda", [&]() {}); + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, iter.dtype(0), "logical_not_cuda", [&]() {}); - AT_DISPATCH_ALL_TYPES_AND2(kBool, kHalf, iter.dtype(1), "logical_not_cuda", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, iter.dtype(1), "logical_not_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> bool { return !a; }); }); } void neg_kernel_cuda(TensorIterator& iter) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "neg_cuda", [&]() { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "neg_cuda", [&] { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { - return -a; - }); + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return -a; }); }); } @@ -51,9 +49,26 @@ void signbit_kernel_cuda(TensorIterator& iter){ }); } +template +__host__ __device__ static inline c10::complex sgn_wrapper(c10::complex z) { + if (z == c10::complex(0, 0)) { + return c10::complex(0, 0); + } else { + return z / std::abs(z); + } +} + +void sgn_kernel_cuda(TensorIterator& iter){ + AT_DISPATCH_COMPLEX_TYPES(iter.dtype(), "sgn_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return sgn_wrapper(a); + }); + }); +} REGISTER_DISPATCH(logical_not_stub, &logical_not_kernel_cuda); REGISTER_DISPATCH(neg_stub, &neg_kernel_cuda); REGISTER_DISPATCH(sign_stub, &sign_kernel_cuda); REGISTER_DISPATCH(signbit_stub, &signbit_kernel_cuda); +REGISTER_DISPATCH(sgn_stub, &sgn_kernel_cuda); }} // namespace at::native diff --git a/aten/src/ATen/native/cudnn/Conv.cpp b/aten/src/ATen/native/cudnn/Conv.cpp index 4ddd533ec8f8..4524af2fe244 100644 --- a/aten/src/ATen/native/cudnn/Conv.cpp +++ b/aten/src/ATen/native/cudnn/Conv.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -100,6 +101,10 @@ std::tuple cudnn_convolution_transpose_backward( // if(dataType == CUDNN_DATA_HALF) // AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_TENSOR_OP_MATH)); // +// Update: AT_CUDNN_CHECK is updated with AT_CUDNN_CHECK_WITH_SHAPES, which +// automatically prints tensor shapes and convolution parameters if there is +// a cuDNN exception thrown. +// // When cudnnSetConvolutionMathType is called before cudnnGet/cudnnFind, it informs // cudnnGet/cudnnFind to iterate/take into account both tensor core and non-tensor-core algos. // If you don't call cudnnSetConvolutionMathType before calling cudnnGet/cudnnFind, @@ -220,6 +225,19 @@ struct ConvolutionParams // forward and backward, so you can reuse the benchmark entry, }; +std::ostream& operator<<(std::ostream & out, const ConvolutionParams& params) { + out << "ConvolutionParams \n" + << " data_type = " << cudnnTypeToString(params.dataType) << "\n" + << " padding = " << ArrayRef{params.padding} << "\n" + << " stride = " << ArrayRef{params.stride} << "\n" + << " dilation = " << ArrayRef{params.dilation} << "\n" + << " groups = " << params.groups << "\n" + << " deterministic = " << (params.deterministic ? "true" : "false") << "\n" + << " allow_tf32 = " << (params.allow_tf32 ? "true" : "false") << "\n"; + + return out; +} + // NB: This can't be a constructor, because then ConvolutionParams // would not be a POD anymore. // TODO: Use TensorGeometry here instead of the entire Tensor, which we @@ -268,6 +286,61 @@ struct ConvolutionArgs { } }; +std::string repro_from_args(const ConvolutionArgs& args) { + auto pybool = [](bool b) -> const char* { return b ? 
"True" : "False"; }; + std::string partial_dtype; + switch (args.params.dataType) { + case CUDNN_DATA_FLOAT: partial_dtype = "float"; break; + case CUDNN_DATA_DOUBLE: partial_dtype = "double"; break; + case CUDNN_DATA_HALF: partial_dtype = "half"; break; + default: partial_dtype = "unsupported"; + } + const std::string full_dtype = "torch." + partial_dtype; + const int out_channels = args.weight.sizes()[0]; + const int in_channels = args.weight.sizes()[1] * args.params.groups; + const size_t dim = args.input.sizes().size(); + const std::string channels_last_xd = dim == 4 ? "channels_last" : "channels_last_3d"; + const std::string to_channels_last = args.input.suggest_memory_format() == at::MemoryFormat::ChannelsLast \ + ? ".to(memory_format=torch." + channels_last_xd + ")" : ""; + + std::ostringstream ss; + ss << "You can try to repro this exception using the following code snippet. "; + ss << "If that doesn't trigger the error, please include your original repro script when reporting this issue.\n\n"; + ss << "import torch\n"; + ss << "torch.backends.cuda.matmul.allow_tf32 = " << pybool(at::globalContext().allowTF32CuBLAS()) << "\n"; + ss << "torch.backends.cudnn.benchmark = " << pybool(at::globalContext().benchmarkCuDNN()) << "\n"; + ss << "torch.backends.cudnn.deterministic = " << pybool(args.params.deterministic) << "\n"; + ss << "torch.backends.cudnn.allow_tf32 = " << pybool(args.params.allow_tf32) << "\n"; + ss << "data = torch.randn(" << args.input.sizes() << ", dtype=" << full_dtype << ", "; + ss << "device='cuda', requires_grad=True)" << to_channels_last << "\n"; + ss << "net = torch.nn.Conv" << dim-2 << "d(" << in_channels << ", " << out_channels << ", "; + ss << "kernel_size=" << args.weight.sizes().slice(2) << ", "; + ss << "padding=" << ArrayRef(args.params.padding, dim-2) << ", "; + ss << "stride=" << ArrayRef(args.params.stride, dim-2) << ", "; + ss << "dilation=" << ArrayRef(args.params.dilation, dim-2) << ", "; + ss << "groups=" << args.params.groups << ")\n"; + ss << "net = net.cuda()." 
<< partial_dtype << "()" << to_channels_last << "\n"; + ss << "out = net(data)\n"; + ss << "out.backward(torch.randn_like(out))\n"; + ss << "torch.cuda.synchronize()\n\n"; + + return ss.str(); +} + +std::ostream& operator<<(std::ostream & out, const ConvolutionArgs& args) { + out << repro_from_args(args) // already has a trailing newline + << args.params // already has a trailing newline + << "input: " << args.idesc // already has a trailing newline + << "output: " << args.odesc // already has a trailing newline + << "weight: " << args.wdesc // already has a trailing newline + << "Pointer addresses: " << "\n" + << " input: " << args.input.data_ptr() << "\n" + << " output: " << args.output.data_ptr() << "\n" + << " weight: " << args.weight.data_ptr() << "\n"; + + return out; +} + // --------------------------------------------------------------------- // // Benchmarking @@ -457,7 +530,7 @@ struct algorithm_search { int perf_count; std::unique_ptr perf_results(new perf_t[num_algos]); if (!benchmark) { - AT_CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7( + AT_CUDNN_CHECK_WITH_SHAPES(cudnnGetConvolutionForwardAlgorithm_v7( args.handle, args.idesc.desc(), args.wdesc.desc(), @@ -465,11 +538,11 @@ struct algorithm_search { args.odesc.desc(), num_algos, &perf_count, - perf_results.get())); + perf_results.get()), args); } else { size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); Workspace ws(max_ws_size); - AT_CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithmEx( + AT_CUDNN_CHECK_WITH_SHAPES(cudnnFindConvolutionForwardAlgorithmEx( args.handle, args.idesc.desc(), args.input.data_ptr(), args.wdesc.desc(), args.weight.data_ptr(), @@ -479,7 +552,7 @@ struct algorithm_search { &perf_count, perf_results.get(), ws.data, - ws.size)); + ws.size), args); // Free the cached blocks in our caching allocator. They are // needed here because the above benchmarking uses a huge amount of memory, @@ -493,14 +566,14 @@ struct algorithm_search { const ConvolutionArgs& args, algo_t algo, size_t* workspaceSize) { - AT_CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize( + AT_CUDNN_CHECK_WITH_SHAPES(cudnnGetConvolutionForwardWorkspaceSize( args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.odesc.desc(), algo, - workspaceSize)); + workspaceSize), args); } }; @@ -527,7 +600,7 @@ struct algorithm_search { int perf_count; std::unique_ptr perf_results(new perf_t[num_algos]); if (!benchmark) { - AT_CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm_v7( + AT_CUDNN_CHECK_WITH_SHAPES(cudnnGetConvolutionBackwardDataAlgorithm_v7( args.handle, args.wdesc.desc(), args.odesc.desc(), @@ -535,11 +608,11 @@ struct algorithm_search { args.idesc.desc(), num_algos, &perf_count, - perf_results.get())); + perf_results.get()), args); } else { size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); Workspace ws(max_ws_size); - AT_CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithmEx( + AT_CUDNN_CHECK_WITH_SHAPES(cudnnFindConvolutionBackwardDataAlgorithmEx( args.handle, args.wdesc.desc(), args.weight.data_ptr(), args.odesc.desc(), args.output.data_ptr(), @@ -549,7 +622,7 @@ struct algorithm_search { &perf_count, perf_results.get(), ws.data, - ws.size)); + ws.size), args); // Free the cached blocks in our caching allocator. 
They are // needed here because the above benchmarking uses a huge amount of memory, @@ -563,14 +636,14 @@ struct algorithm_search { const ConvolutionArgs& args, cudnnConvolutionBwdDataAlgo_t algo, size_t* workspaceSize) { - AT_CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize( + AT_CUDNN_CHECK_WITH_SHAPES(cudnnGetConvolutionBackwardDataWorkspaceSize( args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.idesc.desc(), algo, - workspaceSize)); + workspaceSize), args); } }; @@ -599,7 +672,7 @@ struct algorithm_search { std::unique_ptr perf_results(new perf_t[num_algos]); int perf_count; if (!benchmark) { - AT_CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm_v7( + AT_CUDNN_CHECK_WITH_SHAPES(cudnnGetConvolutionBackwardFilterAlgorithm_v7( args.handle, args.idesc.desc(), args.odesc.desc(), @@ -607,11 +680,11 @@ struct algorithm_search { args.wdesc.desc(), num_algos, &perf_count, - perf_results.get())); + perf_results.get()), args); } else { size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); Workspace ws(max_ws_size); - AT_CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithmEx( + AT_CUDNN_CHECK_WITH_SHAPES(cudnnFindConvolutionBackwardFilterAlgorithmEx( args.handle, args.idesc.desc(), args.input.data_ptr(), args.odesc.desc(), args.output.data_ptr(), @@ -621,7 +694,7 @@ struct algorithm_search { &perf_count, perf_results.get(), ws.data, - ws.size)); + ws.size), args); // Free the cached blocks in our caching allocator. They are // needed here because the above benchmarking uses a huge amount of memory, @@ -633,14 +706,14 @@ struct algorithm_search { static void getWorkspaceSize(const ConvolutionArgs& args, algo_t algo, size_t* workspaceSize) { - AT_CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize( + AT_CUDNN_CHECK_WITH_SHAPES(cudnnGetConvolutionBackwardFilterWorkspaceSize( args.handle, args.idesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.wdesc.desc(), algo, - workspaceSize)); + workspaceSize), args); } }; @@ -850,17 +923,18 @@ void raw_cudnn_convolution_forward_out_32bit( // whether to use Tensor core kernels or not // See Note [behavior of cudnnFind and cudnnGet] ASSERT_CORRECT_PRECISION(fwdAlgPerf.mathType); - AT_CUDNN_CHECK(cudnnSetConvolutionMathType(args.cdesc.mut_desc(), fwdAlgPerf.mathType)); + AT_CUDNN_CHECK_WITH_SHAPES(cudnnSetConvolutionMathType(args.cdesc.mut_desc(), fwdAlgPerf.mathType), args); Constant one(dataType, 1); Constant zero(dataType, 0); - AT_CUDNN_CHECK(cudnnConvolutionForward( - args.handle, - &one, args.idesc.desc(), input.data_ptr(), - args.wdesc.desc(), weight.data_ptr(), - args.cdesc.desc(), fwdAlgPerf.algo, workspace.data_ptr(), fwdAlgPerf.memory, - &zero, args.odesc.desc(), output.data_ptr())); + AT_CUDNN_CHECK_WITH_SHAPES(cudnnConvolutionForward( + args.handle, + &one, args.idesc.desc(), input.data_ptr(), + args.wdesc.desc(), weight.data_ptr(), + args.cdesc.desc(), fwdAlgPerf.algo, workspace.data_ptr(), fwdAlgPerf.memory, + &zero, args.odesc.desc(), output.data_ptr()), + args, "Forward algorithm: ", static_cast(fwdAlgPerf.algo), "\n"); } ); } @@ -986,17 +1060,22 @@ void raw_cudnn_convolution_backward_input_out_32bit( // whether to use Tensor core kernels or not // See Note [behavior of cudnnFind and cudnnGet] ASSERT_CORRECT_PRECISION(bwdDataAlgPerf.mathType); - AT_CUDNN_CHECK(cudnnSetConvolutionMathType(args.cdesc.mut_desc(), bwdDataAlgPerf.mathType)); + AT_CUDNN_CHECK_WITH_SHAPES(cudnnSetConvolutionMathType(args.cdesc.mut_desc(), bwdDataAlgPerf.mathType), args); Constant one(dataType, 1); 
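// AT_CUDNN_CHECK_WITH_SHAPES, used throughout this file, forwards extra context -- the
// ConvolutionArgs printout, pointer addresses, the chosen algorithm -- into the error message
// when a cuDNN call fails. A hypothetical wrapper of the same shape (an illustration only,
// not the real macro from ATen's cuDNN exception helpers):
#include <cudnn.h>
#include <sstream>
#include <string>
#include <c10/util/Exception.h>   // TORCH_CHECK

template <typename... Args>
std::string join_context(const Args&... args) {
  std::ostringstream oss;
  (oss << ... << args);  // C++17 fold over operator<<, e.g. streaming a ConvolutionArgs
  return oss.str();
}

#define CHECK_CUDNN_WITH_CONTEXT(expr, ...)                          \
  do {                                                               \
    cudnnStatus_t status_ = (expr);                                  \
    TORCH_CHECK(status_ == CUDNN_STATUS_SUCCESS,                     \
                "cuDNN error: ", cudnnGetErrorString(status_), "\n", \
                join_context(__VA_ARGS__));                          \
  } while (0)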
Constant zero(dataType, 0); - AT_CUDNN_CHECK(cudnnConvolutionBackwardData( + AT_CUDNN_CHECK_WITH_SHAPES(cudnnConvolutionBackwardData( args.handle, &one, args.wdesc.desc(), weight.data_ptr(), args.odesc.desc(), grad_output.data_ptr(), args.cdesc.desc(), bwdDataAlgPerf.algo, workspace.data_ptr(), bwdDataAlgPerf.memory, - &zero, args.idesc.desc(), grad_input.data_ptr())); + &zero, args.idesc.desc(), grad_input.data_ptr()), + args, + "Additional pointer addresses: \n", + " grad_output: ", grad_output.data_ptr(), "\n", + " grad_input: ", grad_input.data_ptr(), "\n", + "Backward data algorithm: ", static_cast(bwdDataAlgPerf.algo), "\n"); } ); } @@ -1148,17 +1227,22 @@ void raw_cudnn_convolution_backward_weight_out_32bit( // whether to use Tensor core kernels or not // See Note [behavior of cudnnFind and cudnnGet] ASSERT_CORRECT_PRECISION(bwdFilterAlgPerf.mathType); - AT_CUDNN_CHECK(cudnnSetConvolutionMathType(args.cdesc.mut_desc(), bwdFilterAlgPerf.mathType)); + AT_CUDNN_CHECK_WITH_SHAPES(cudnnSetConvolutionMathType(args.cdesc.mut_desc(), bwdFilterAlgPerf.mathType), args); Constant one(dataType, 1); Constant zero(dataType, 0); - AT_CUDNN_CHECK(cudnnConvolutionBackwardFilter( + AT_CUDNN_CHECK_WITH_SHAPES(cudnnConvolutionBackwardFilter( args.handle, &one, args.idesc.desc(), input.data_ptr(), args.odesc.desc(), grad_output.data_ptr(), args.cdesc.desc(), bwdFilterAlgPerf.algo, workspace.data_ptr(), bwdFilterAlgPerf.memory, - &zero, args.wdesc.desc(), grad_weight.data_ptr())); + &zero, args.wdesc.desc(), grad_weight.data_ptr()), + args, + "Additional pointer addresses: \n", + " grad_output: ", grad_output.data_ptr(), "\n", + " grad_weight: ", grad_weight.data_ptr(), "\n", + "Backward filter algorithm: ", static_cast(bwdFilterAlgPerf.algo), "\n"); } ); } diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index 5be7d6eea8ea..aa99490deb2d 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -587,25 +587,60 @@ namespace { } } - cudnnRNNAlgo_t get_algo(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors, const Tensor input){ - cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); - const int64_t bsize = tensors.mini_batch; - //excluding Turing from using persistent rnn. 
- if (prop->major == 7 && prop->minor != 5 && getCudnnDataType(input) == CUDNN_DATA_HALF && !tensors.is_input_packed()) { - if (rnn.num_layers == 1 && rnn.hidden_size <= 1024 && rnn.num_directions() == 1 && - rnn.hidden_size % 128 == 0 && tensors.input_size % 128 == 0){ - //technically, batch size should be multiple of 8, but there are quite a few multiple-of-8 batchsizes that give bad perf, - //weed them out - if ((bsize % 16 == 0 && bsize != 80 && bsize !=112) || bsize == 8){ - if ((tensors.seq_length >=40 && bsize <=128) || - (tensors.seq_length >=20 && bsize <=96) || - (tensors.seq_length >=10 && bsize <=32)) { - return CUDNN_RNN_ALGO_PERSIST_STATIC; - } - } - } + inline bool use_persist_common_heuristics(const RNNDescriptorParams& rnn, + const TensorDescriptorListParams& tensors) { + return rnn.num_layers == 1 && + rnn.hidden_size <= 1024 && + rnn.num_directions() == 1 && + rnn.hidden_size % 128 == 0 && + tensors.input_size % 128 == 0; + } + + inline bool use_persist_device_heuristics(const RNNDescriptorParams& rnn, + const TensorDescriptorListParams& tensors) { + auto bsize = tensors.mini_batch; + cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); + if (prop->major == 7) { + if (prop->minor == 5) { + // Excludes Turing from using persistent rnn. + return false; + } else { + // technically, batch size should be multiple of 8, but there are quite a few multiple-of-8 batchsizes that give bad perf, + // weed them out + return ((bsize % 16 == 0 && bsize != 80 && bsize !=112) || bsize == 8) && + ((tensors.seq_length >=40 && bsize <=128) || + (tensors.seq_length >=20 && bsize <=96) || + (tensors.seq_length >=10 && bsize <=32)); + } + } else if (prop->major >= 8) { + // Based on tests by Vasily Volkov and xwang233. Vasily only tried bsize <= 128, + // so conservatively enable persistence for bsize <= 128 only. + // TODO: Run more tests for bsize > 128. + if (rnn.mode == CUDNN_GRU) { + // Persistent GRU performance is flakier than other RNN types. Exclude them for now. + // TODO: Write a more refined GRU heuristic. + return false; + } else if (rnn.mode == CUDNN_LSTM) { + // Persistent LSTMs are comparable to or better than non-persistent for bsize <= 128. + return bsize <= 128; + } else { + // Persistent RNN_RELU and TANH show poor performance when bsize >= 96 AND hidden size >= 896. + return (bsize <= 128) && (bsize < 96 || rnn.hidden_size < 896); } - return CUDNN_RNN_ALGO_STANDARD; + } else { + return false; + } + } + + cudnnRNNAlgo_t get_algo(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors, const Tensor input) { + if (getCudnnDataType(input) == CUDNN_DATA_HALF && + !tensors.is_input_packed()) { + if (use_persist_common_heuristics(rnn, tensors) && + use_persist_device_heuristics(rnn, tensors)) { + return CUDNN_RNN_ALGO_PERSIST_STATIC; + } + } + return CUDNN_RNN_ALGO_STANDARD; } cudnnDataType_t promote_rnn_math_type(cudnnDataType_t dtype) { diff --git a/aten/src/ATen/native/group_norm.cpp b/aten/src/ATen/native/group_norm.cpp index 229e54a9ce62..beb4d940363e 100644 --- a/aten/src/ATen/native/group_norm.cpp +++ b/aten/src/ATen/native/group_norm.cpp @@ -133,5 +133,29 @@ Tensor group_norm( DEFINE_DISPATCH(GroupNormKernel); DEFINE_DISPATCH(GroupNormBackwardKernel); +std::tuple math_group_norm( + const at::Tensor& input, const at::Tensor& weight, + const at::Tensor& bias, int64_t N, int64_t C, int64_t HxW, + int64_t group, double eps) { + auto input_shape = input.sizes(); + at::Tensor input_reshaped = input.view({1, N * group, N ? 
-1 : 1}); + auto outputs = at::native_batch_norm( + input_reshaped, /*weight=*/{}, /*bias=*/{}, /*running_mean=*/{}, + /*running_var=*/{}, /*training=*/true, /*momentum=*/0, eps); + at::Tensor out = std::get<0>(outputs); + out = out.view(input_shape); + std::vector affine_param_shape(input.dim(), 1); + affine_param_shape[1] = C; + if (weight.defined() && bias.defined()) { + out = bias.view(affine_param_shape).addcmul(out, weight.view(affine_param_shape), 1); + } else if (weight.defined()) { + out = out.mul(weight.view(affine_param_shape)); + } else if (bias.defined()) { + out = out.add(bias.view(affine_param_shape)); + } + at::Tensor mean = std::get<1>(outputs).view({N, group}); + at::Tensor rstd = std::get<2>(outputs).view({N, group}); + return std::make_tuple(out, mean, rstd); +} } // namespace native } // namespace at diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 6541e45b3230..c27cb4083ac2 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -167,13 +167,13 @@ - func: _fused_dropout(Tensor self, float p, Generator? generator=None) -> (Tensor, Tensor) variants: function dispatch: - CUDA: fused_dropout_cuda + CUDA: fused_dropout_cuda - func: _masked_scale(Tensor self, Tensor mask, float scale) -> Tensor use_c10_dispatcher: full variants: function dispatch: - CUDA: masked_scale_cuda + CUDA: masked_scale_cuda - func: _sobol_engine_draw(Tensor quasi, int n, Tensor sobolstate, int dimension, int num_generated, ScalarType? dtype) -> (Tensor, Tensor) use_c10_dispatcher: full @@ -226,6 +226,8 @@ variants: function, method - func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: abs_out # Note [Adding an alias] # To add an alias do the following: @@ -268,6 +270,8 @@ variants: function, method - func: angle.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: angle_out - func: view_as_real(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full @@ -277,6 +281,17 @@ use_c10_dispatcher: full variants: function +- func: sgn(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: sgn_(Tensor(a!) self) -> Tensor(a!) + variants: method + +- func: sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sgn_out + - func: real(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full variants: function @@ -290,6 +305,8 @@ variants: function, method - func: conj.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: conj_out - func: _conj(Tensor self) -> Tensor use_c10_dispatcher: full @@ -304,6 +321,8 @@ variants: function, method - func: acos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: acos_out # arccos, alias of acos - func: arccos(Tensor self) -> Tensor @@ -378,12 +397,18 @@ - func: addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: addmv - func: addmv_(Tensor(a!) self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: addmv_ - func: addmv.out(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: addmv_out - func: _addmv_impl_(Tensor(a!) self, Tensor self2, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) 
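// math_group_norm above implements group norm by reusing native_batch_norm: viewing the
// (N, C, *) input as (1, N*group, -1) turns every (sample, group) pair into one batch-norm
// "channel", so batch norm's per-channel statistics are exactly group norm's per-group
// statistics. Condensed restatement in ATen calls (assumes N > 0 and both affine parameters
// defined; the registered kernel additionally returns the per-group mean/rstd):
#include <ATen/ATen.h>
#include <vector>

at::Tensor group_norm_via_batch_norm(const at::Tensor& x, const at::Tensor& weight,
                                     const at::Tensor& bias, int64_t group, double eps) {
  const auto N = x.size(0);
  const auto C = x.size(1);
  auto y = std::get<0>(at::native_batch_norm(
      x.reshape({1, N * group, -1}), /*weight=*/{}, /*bias=*/{},
      /*running_mean=*/{}, /*running_var=*/{}, /*training=*/true, /*momentum=*/0, eps));
  y = y.reshape(x.sizes());
  std::vector<int64_t> affine_shape(x.dim(), 1);
  affine_shape[1] = C;
  return y.mul(weight.reshape(affine_shape)).add(bias.reshape(affine_shape));
}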
use_c10_dispatcher: full @@ -412,8 +437,12 @@ - func: all.dim(Tensor self, int dim, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: all - func: all.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: all_out - func: all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor variants: function, method @@ -427,8 +456,12 @@ - func: any.dim(Tensor self, int dim, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: any - func: any.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: any_out - func: any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor variants: function, method @@ -480,6 +513,8 @@ variants: function, method - func: acosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: acosh_out # arccosh, alias for acosh - func: arccosh(Tensor self) -> Tensor @@ -501,6 +536,8 @@ variants: function, method - func: asinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: asinh_out # arcsinh, alias for asinh - func: arcsinh(Tensor self) -> Tensor @@ -522,6 +559,8 @@ variants: function, method - func: atanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: atanh_out # arctanh, alias for atanh - func: arctanh(Tensor self) -> Tensor @@ -553,8 +592,14 @@ - func: asin_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: asin_ + SparseCPU, SparseCUDA: asin_sparse_ - func: asin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: asin_out + SparseCPU, SparseCUDA: asin_out_sparse # arcsin, alias of asin - func: arcsin(Tensor self) -> Tensor @@ -576,6 +621,8 @@ variants: function, method - func: atan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: atan_out # arctan, alias of atan - func: arctan(Tensor self) -> Tensor @@ -661,12 +708,18 @@ - func: bernoulli.out(Tensor self, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) variants: function + dispatch: + CPU, CUDA: bernoulli_out - func: bernoulli_.Tensor(Tensor(a!) self, Tensor p, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: bernoulli_ - func: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: bernoulli_ # This out-of-place version isn't used explicitly, but needed by jit. # There is no default valid on `p` here because it would introduce ambiguity @@ -871,6 +924,8 @@ variants: function, method - func: clamp.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: clamp_out - func: clamp_max(Tensor self, Scalar max) -> Tensor use_c10_dispatcher: full @@ -881,6 +936,8 @@ variants: function, method - func: clamp_max.out(Tensor self, Scalar max, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: clamp_max_out - func: clamp_min(Tensor self, Scalar min) -> Tensor use_c10_dispatcher: full @@ -891,6 +948,8 @@ variants: function, method - func: clamp_min.out(Tensor self, Scalar min, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: clamp_min_out # clip is an alias for clamp - func: clip(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor @@ -911,12 +970,16 @@ variants: function - func: complex.out(Tensor real, Tensor imag, *, Tensor(a!) 
out) -> Tensor(a!) + dispatch: + CPU, CUDA: complex_out - func: polar(Tensor abs, Tensor angle) -> Tensor use_c10_dispatcher: full variants: function - func: polar.out(Tensor abs, Tensor angle, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: polar_out - func: constant_pad_nd(Tensor self, int[] pad, Scalar value=0) -> Tensor use_c10_dispatcher: full @@ -990,6 +1053,8 @@ variants: function, method - func: cos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: cos_out - func: cosh(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1000,6 +1065,8 @@ variants: function, method - func: cosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: cosh_out - func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor use_c10_dispatcher: full @@ -1007,6 +1074,8 @@ - func: count_nonzero.dim_IntList(Tensor self, int[] dim) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: count_nonzero - func: count_nonzero(Tensor self, int? dim=None) -> Tensor use_c10_dispatcher: full @@ -1183,7 +1252,7 @@ - func: _ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor) use_c10_dispatcher: full dispatch: - CPU: ctc_loss_cpu + CPU: ctc_loss_cpu CUDA: ctc_loss_gpu - func: _ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor @@ -1449,6 +1518,8 @@ variants: function, method - func: erf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: erf_out - func: erfc(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1459,6 +1530,8 @@ variants: function, method - func: erfc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: erfc_out - func: exp(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1469,6 +1542,8 @@ variants: function, method - func: exp.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: exp_out - func: exp2(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1479,6 +1554,8 @@ variants: function, method - func: exp2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: exp2_out - func: expm1(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1593,6 +1670,8 @@ variants: function, method - func: frac.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: frac_out - func: full.names(int[] size, Scalar fill_value, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor device_guard: False @@ -1611,6 +1690,8 @@ CPU: from_file - func: gcd.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: gcd_out - func: gcd(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -1621,6 +1702,8 @@ variants: function, method - func: lcm.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: lcm_out - func: lcm(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -1716,6 +1799,7 @@ use_c10_dispatcher: full dispatch: CPU, CUDA: native_group_norm + Math: math_group_norm - func: native_group_norm_backward(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? 
weight, int N, int C, int HxW, int group, bool[3] output_mask) -> (Tensor, Tensor, Tensor) use_c10_dispatcher: full @@ -1759,6 +1843,8 @@ - func: index.Tensor(Tensor self, Tensor?[] indices) -> Tensor variants: function, method + dispatch: + CPU, CUDA: index # NB: This function is special-cased in tools/autograd/gen_variable_type.py # NB: The following functions are declared in aten/src/ATen/templates/TensorBody.h and defined in aten/src/ATen/TensorIndexing.cpp: # - Tensor Tensor::index(ArrayRef indices) @@ -1791,6 +1877,8 @@ - func: _index_put_impl_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!) variants: function + dispatch: + CPU, CUDA: _index_put_impl_ - func: instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool use_input_stats, float momentum, float eps, bool cudnn_enabled) -> Tensor use_c10_dispatcher: full @@ -1893,6 +1981,16 @@ CPU: layer_norm_backward_cpu CUDA: layer_norm_backward_cuda +- func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: nan_to_num_(Tensor(a!) self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor(a!) + use_c10_dispatcher: full + variants: function, method + +- func: nan_to_num.out(Tensor self, float? nan=None, float? posinf=None, float? neginf=None, *, Tensor(a!) out) -> Tensor(a!) + - func: linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor use_c10_dispatcher: full python_module: nn @@ -1988,12 +2086,16 @@ CPU, CUDA: log2_out - func: logaddexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: logaddexp_out - func: logaddexp(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function - func: logaddexp2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: logaddexp2_out - func: logaddexp2(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -2086,6 +2188,8 @@ - func: matrix_exp(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: matrix_exp - func: matrix_exp_backward(Tensor self, Tensor grad) -> Tensor use_c10_dispatcher: full @@ -2115,6 +2219,8 @@ variants: function, method - func: max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices) + dispatch: + CPU, CUDA: max_out - func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) variants: function, method @@ -2131,6 +2237,8 @@ variants: function, method - func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) 
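The nan_to_num entries registered above add a NumPy-style cleanup op: NaN is replaced by nan (default 0), +inf by posinf, and -inf by neginf, with the dtype's extreme finite values as defaults when the optionals are omitted. A small usage sketch, assuming the at::nan_to_num binding generated from that schema; the input construction is illustrative:

#include <ATen/ATen.h>

void nan_to_num_example() {
  at::Tensor x = at::arange(-2, 3).to(at::kFloat);  // {-2, -1, 0, 1, 2}
  at::Tensor t = at::log(x);                        // {nan, nan, -inf, 0, ~0.693}
  // Explicit replacements: NaN -> 0, +inf -> 1e6, -inf -> -1e6.
  at::Tensor cleaned = at::nan_to_num(t, /*nan=*/0.0, /*posinf=*/1e6, /*neginf=*/-1e6);
}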
+ dispatch: + CPU, CUDA: amax_out # Return: (Tensor output, Tensor indices) - func: max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) @@ -2152,6 +2260,11 @@ dispatch: MkldnnCPU: mkldnn_max_pool3d +- func: quantized_max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor + use_c10_dispatcher: full + dispatch: + QuantizedCPU: quantized_max_pool1d + - func: quantized_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor use_c10_dispatcher: full dispatch: @@ -2202,6 +2315,8 @@ variants: function, method - func: min.dim_min(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices) + dispatch: + CPU, CUDA: min_out - func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) variants: function, method @@ -2213,6 +2328,8 @@ variants: function, method - func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: amin_out - func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor use_c10_dispatcher: full @@ -2333,6 +2450,8 @@ - func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices) use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: mode - func: mode.values(Tensor self, int dim=-1, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) @@ -2528,18 +2647,26 @@ - func: _cdist_forward(Tensor x1, Tensor x2, float p, int? compute_mode) -> Tensor use_c10_dispatcher: full + dispatch: + CPU, CUDA: _cdist_forward - func: _cdist_backward(Tensor grad, Tensor x1, Tensor x2, float p, Tensor cdist) -> Tensor use_c10_dispatcher: full + dispatch: + CPU, CUDA: _cdist_backward - func: pdist(Tensor self, float p=2) -> Tensor use_c10_dispatcher: full - func: _pdist_forward(Tensor self, float p=2) -> Tensor use_c10_dispatcher: full + dispatch: + CPU, CUDA: _pdist_forward - func: _pdist_backward(Tensor grad, Tensor self, float p, Tensor pdist) -> Tensor use_c10_dispatcher: full + dispatch: + CPU, CUDA: _pdist_backward - func: cosine_similarity(Tensor x1, Tensor x2, int dim=1, float eps=1e-08) -> Tensor use_c10_dispatcher: full @@ -2708,6 +2835,8 @@ variants: function, method - func: reciprocal.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: reciprocal_out - func: neg(Tensor self) -> Tensor use_c10_dispatcher: full @@ -2716,8 +2845,14 @@ - func: neg_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: neg_ + SparseCPU, SparseCUDA: neg_sparse_ - func: neg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
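The amax/amin entries wired up above are the value-only reductions; unlike max.dim_max/min.dim_min they do not materialize an index tensor. A short sketch of the distinction, assuming the functional at::amax/at::max bindings:

#include <ATen/ATen.h>
#include <tuple>

void amax_vs_max() {
  at::Tensor x = at::randn({4, 5});
  at::Tensor vals = at::amax(x, /*dim=*/{1});                    // shape [4], values only
  auto vals_and_idx = at::max(x, /*dim=*/1, /*keepdim=*/false);  // (values, indices)
  TORCH_CHECK(at::equal(vals, std::get<0>(vals_and_idx)));
}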
+ dispatch: + CPU, CUDA: neg_out + SparseCPU, SparseCUDA: neg_out_sparse # Alias for neg - func: negative(Tensor self) -> Tensor @@ -2835,10 +2970,14 @@ - func: hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: hardshrink - func: hardshrink_backward(Tensor grad_out, Tensor self, Scalar lambd) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: hardshrink_backward - func: rsqrt(Tensor self) -> Tensor use_c10_dispatcher: full @@ -2888,6 +3027,8 @@ - func: silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: silu_out - func: silu_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full @@ -2909,6 +3050,8 @@ MkldnnCPU: mkldnn_sigmoid_ - func: sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sigmoid_out - func: logit(Tensor self, float? eps=None) -> Tensor use_c10_dispatcher: full @@ -2923,6 +3066,8 @@ CPU, CUDA: logit_ - func: logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: logit_out - func: sin(Tensor self) -> Tensor use_c10_dispatcher: full @@ -2945,6 +3090,8 @@ variants: function, method - func: sinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sinh_out # Returns a copy of this `Variable` that is detached from its autograd graph. # This method is OK to call if the `Variable` is a view. @@ -3119,27 +3266,39 @@ - func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: sum - func: sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: sum - func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor variants: function, method - func: sum.IntList_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sum_out - func: sum.DimnameList_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) - func: nansum(Tensor self, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: nansum - func: nansum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: nansum - func: nansum.IntList_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: nansum_out - func: sum_to_size(Tensor self, int[] size) -> Tensor use_c10_dispatcher: full @@ -3155,6 +3314,8 @@ variants: function, method - func: sqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
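nansum, whose CPU/CUDA dispatch is added above, reduces like sum but treats NaN entries as zero instead of letting them poison the result. A brief sketch, assuming the at::nansum binding; the sqrt of a negative value is only a convenient way to produce a NaN:

#include <ATen/ATen.h>

void nansum_example() {
  at::Tensor x = at::arange(-1, 3).to(at::kFloat).sqrt();  // {nan, 0, 1, ~1.414}
  at::Tensor s = at::sum(x);      // NaN: one element is NaN
  at::Tensor ns = at::nansum(x);  // ~2.414: the NaN is treated as 0
}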
+ dispatch: + CPU, CUDA: sqrt_out - func: square(Tensor self) -> Tensor use_c10_dispatcher: full @@ -3167,23 +3328,33 @@ - func: std(Tensor self, bool unbiased=True) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: std - func: std.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: std - func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: std_mean - func: std_mean.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: std_mean - func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) variants: function - func: std.out(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: std_out - func: std.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor variants: function, method @@ -3193,12 +3364,18 @@ - func: prod(Tensor self, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: prod - func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: prod - func: prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: prod_out - func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor variants: function, method @@ -3224,6 +3401,8 @@ variants: function, method - func: tan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: tan_out - func: tanh(Tensor self) -> Tensor use_c10_dispatcher: full @@ -3237,6 +3416,8 @@ variants: function, method - func: tanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: tanh_out - func: tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor use_c10_dispatcher: full @@ -3350,6 +3531,8 @@ variants: function, method - func: trunc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: trunc_out # Alias for trunc - func: fix(Tensor self) -> Tensor @@ -3428,12 +3611,18 @@ - func: var(Tensor self, bool unbiased=True) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: var - func: var.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: var - func: var.out(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) 
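std_mean, whose CPU/CUDA dispatch is added above (var_mean follows just below), returns the statistic together with the mean from a single call rather than two separate reductions. A small consistency sketch, assuming the functional at::std_mean/at::std/at::mean bindings:

#include <ATen/ATen.h>
#include <tuple>

void std_mean_example() {
  at::Tensor x = at::randn({1000});
  auto sm = at::std_mean(x, /*unbiased=*/true);  // (std, mean) from one call
  TORCH_CHECK(at::allclose(std::get<0>(sm), at::std(x)));
  TORCH_CHECK(at::allclose(std::get<1>(sm), at::mean(x)));
}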
+ dispatch: + CPU, CUDA: var_out - func: var.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor variants: function, method @@ -3443,10 +3632,14 @@ - func: var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: var_mean - func: var_mean.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: var_mean - func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) variants: function @@ -3482,6 +3675,8 @@ - func: _s_where(Tensor condition, Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: _s_where - func: norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor use_c10_dispatcher: full @@ -3584,8 +3779,8 @@ - func: _sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor use_c10_dispatcher: full dispatch: - SparseCPU: _sparse_sum_backward_cpu - SparseCUDA: _sparse_sum_backward_cuda + SparseCPU: _sparse_sum_backward_cpu + SparseCUDA: _sparse_sum_backward_cuda - func: _sparse_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full @@ -3598,11 +3793,13 @@ use_c10_dispatcher: full dispatch: SparseCPU: softmax_sparse_cpu + SparseCUDA: softmax_sparse_cuda - func: _sparse_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor use_c10_dispatcher: full dispatch: SparseCPU: softmax_backward_sparse_cpu + SparseCUDA: softmax_backward_sparse_cuda - func: _sparse_log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full @@ -3615,11 +3812,13 @@ use_c10_dispatcher: full dispatch: SparseCPU: log_softmax_sparse_cpu + SparseCUDA: log_softmax_sparse_cuda - func: _sparse_log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor use_c10_dispatcher: full dispatch: SparseCPU: log_softmax_backward_sparse_cpu + SparseCUDA: log_softmax_backward_sparse_cuda - func: norm.ScalarOpt_dtype(Tensor self, Scalar? p, *, ScalarType dtype) -> Tensor use_c10_dispatcher: full @@ -3638,8 +3837,12 @@ variants: function, method - func: norm.dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: norm_out - func: norm.out(Tensor self, Scalar? p, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: norm_out - func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor variants: function, method @@ -3748,6 +3951,8 @@ - func: rsub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: rsub - func: heaviside.out(Tensor self, Tensor values, *, Tensor(a!) out) -> Tensor(a!) 
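rsub, which gains a shared CPU/CUDA dispatch entry above, is subtraction with the operands swapped: rsub(self, other, alpha) computes other - alpha * self. A one-line sketch, assuming the at::rsub binding:

#include <ATen/ATen.h>

void rsub_example() {
  at::Tensor a = at::ones({3});
  at::Tensor b = at::full({3}, 5.0f);
  at::Tensor r = at::rsub(a, b, /*alpha=*/2);  // 5 - 2*1 == 3
  TORCH_CHECK(at::allclose(r, at::full({3}, 3.0f)));
}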
dispatch: @@ -4197,6 +4402,8 @@ - func: fake_quantize_per_tensor_affine(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: fake_quantize_per_tensor_affine - func: fake_quantize_per_tensor_affine_backward(Tensor grad, Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full @@ -4205,6 +4412,8 @@ - func: _fake_quantize_learnable_per_tensor_affine(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: _fake_quantize_learnable_per_tensor_affine - func: _fake_quantize_learnable_per_tensor_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> (Tensor, Tensor, Tensor) use_c10_dispatcher: full @@ -4213,6 +4422,8 @@ - func: fake_quantize_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: fake_quantize_per_channel_affine - func: fake_quantize_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full @@ -4221,6 +4432,8 @@ - func: _fake_quantize_learnable_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: _fake_quantize_learnable_per_channel_affine - func: _fake_quantize_learnable_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> (Tensor, Tensor, Tensor) use_c10_dispatcher: full @@ -4234,6 +4447,10 @@ use_c10_dispatcher: full variants: function +- func: choose_qparams_optimized(Tensor input, int numel, int n_bins, float ratio, int bit_width) -> (Tensor, Tensor) + use_c10_dispatcher: full + variants: function + # to(Device) must not exist because all constructors of Device also works for # TensorOptions. Otherwise, an ambiguity error is thrown. # See NOTE [ TensorOptions Constructors ]. @@ -4787,6 +5004,8 @@ - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) use_c10_dispatcher: full variants: method + dispatch: + CPU, CUDA: atan2_ - func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!) use_c10_dispatcher: full @@ -4805,6 +5024,8 @@ - func: digamma_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full variants: method + dispatch: + CPU, CUDA: digamma_ - func: polygamma_(Tensor(a!) self, int n) -> Tensor(a!) use_c10_dispatcher: full @@ -4894,27 +5115,43 @@ - func: random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: random_ - func: random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: random_ - func: random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: random_ - func: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: uniform_ - func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: cauchy_ - func: log_normal_(Tensor(a!) self, float mean=1, float std=2, *, Generator? 
generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: log_normal_ - func: exponential_(Tensor(a!) self, float lambd=1, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: exponential_ - func: geometric_(Tensor(a!) self, float p, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: geometric_ # wrappers for TH functions @@ -4933,10 +5170,14 @@ device_guard: False - func: cross.out(Tensor self, Tensor other, int? dim=None, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: cross_out - func: cross(Tensor self, Tensor other, int? dim=None) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: cross - func: triu.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -5271,15 +5512,15 @@ - func: take.out(Tensor self, Tensor index, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: legacy::cpu::_th_take_out - CUDA: legacy::cuda::_th_take_out + CPU: take_out_cpu + CUDA: take_out_cuda - func: take(Tensor self, Tensor index) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: legacy::cpu::_th_take - CUDA: legacy::cuda::_th_take + CPU: take_cpu + CUDA: take_cuda - func: take_backward(Tensor grad, Tensor input, Tensor index) -> Tensor use_c10_dispatcher: full @@ -5368,6 +5609,8 @@ use_c10_dispatcher: full - func: addcmul.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: addcmul_out - func: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor use_c10_dispatcher: full @@ -5378,6 +5621,8 @@ variants: method - func: addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: addcdiv_out - func: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor use_c10_dispatcher: full @@ -5597,12 +5842,18 @@ CPU, CUDA: lgamma - func: digamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: digamma_out - func: digamma(Tensor self) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: digamma - func: polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: polygamma_out - func: polygamma(int n, Tensor self) -> Tensor use_c10_dispatcher: full @@ -5635,6 +5886,8 @@ variants: function, method - func: i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: i0_out - func: sign(Tensor self) -> Tensor use_c10_dispatcher: full @@ -5662,10 +5915,14 @@ variants: method, function - func: atan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: atan2_out - func: atan2(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: atan2 - func: lerp.Scalar_out(Tensor self, Tensor end, Scalar weight, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -5728,19 +5985,27 @@ CUDA: fmod_cuda - func: hypot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: hypot_out - func: hypot(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: hypot - func: hypot_(Tensor(a!) self, Tensor other) -> Tensor(a!) variants: method - func: nextafter.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
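The take kernels ported off the legacy TH implementations above index the input as if it were flattened to 1-D, regardless of its actual shape. A short sketch, assuming the at::take binding:

#include <ATen/ATen.h>

void take_example() {
  at::Tensor t = at::arange(12).reshape({3, 4});  // values 0..11
  at::Tensor idx = at::arange(3) * 5;             // flat positions {0, 5, 10}
  at::Tensor out = at::take(t, idx);              // {0, 5, 10}
}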
+ dispatch: + CPU, CUDA: nextafter_out - func: nextafter(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: nextafter - func: nextafter_(Tensor(a!) self, Tensor other) -> Tensor(a!) variants: method @@ -5782,8 +6047,12 @@ - func: maximum(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: maximum - func: maximum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: maximum_out # binary max, alias of maximum # NOTE: max is not an alias for maximum, since there is also unary max @@ -5796,8 +6065,12 @@ - func: minimum(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: minimum - func: minimum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: minimum_out # binary min, alias for minimum # NOTE: min is not an alias for minimum, since there is also unary min @@ -5878,6 +6151,8 @@ - func: all(Tensor self) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: all - func: any(Tensor self) -> Tensor use_c10_dispatcher: full @@ -5953,18 +6228,32 @@ - func: normal_(Tensor(a!) self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: normal_ - func: normal.Tensor_float_out(Tensor mean, float std=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: normal_out - func: normal.Tensor_float(Tensor mean, float std=1, *, Generator? generator=None) -> Tensor + dispatch: + CPU, CUDA: normal - func: normal.float_Tensor_out(float mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: normal_out - func: normal.float_Tensor(float mean, Tensor std, *, Generator? generator=None) -> Tensor + dispatch: + CPU, CUDA: normal - func: normal.Tensor_Tensor_out(Tensor mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: normal_out - func: normal.Tensor_Tensor(Tensor mean, Tensor std, *, Generator? generator=None) -> Tensor + dispatch: + CPU, CUDA: normal - func: normal.float_float(float mean, float std, int[] size, *, Generator? generator=None, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -5974,23 +6263,6 @@ use_c10_dispatcher: full variants: method, function -- func: _addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full - dispatch: - CPU: legacy::cpu::_th_addr - CUDA: addr_cuda - -- func: _addr_(Tensor(a!) self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full - dispatch: - CPU: legacy::cpu::_th_addr_ - CUDA: addr__cuda - -- func: _addr.out(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU: legacy::cpu::_th_addr_out - CUDA: addr_out_cuda - - func: _index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!) use_c10_dispatcher: full dispatch: @@ -6029,11 +6301,12 @@ dispatch: CPU: legacy::cpu::_th_std -- func: _amp_non_finite_check_and_unscale_(Tensor(a!) self, Tensor(b!) found_inf, Tensor inv_scale) -> () +- func: _amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) 
found_inf, Tensor inv_scale) -> () use_c10_dispatcher: full + device_guard: False variants: function dispatch: - CUDA: _amp_non_finite_check_and_unscale_cuda_ + CUDA: _amp_foreach_non_finite_check_and_unscale_cuda_ - func: _amp_update_scale(Tensor(a!) growth_tracker, Tensor current_scale, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor use_c10_dispatcher: full @@ -6063,6 +6336,7 @@ CUDA: foreach_tensor_add_scalar_kernel_cuda - func: _foreach_add_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6070,6 +6344,7 @@ CUDA: foreach_tensor_add_scalar_kernel_cuda_ - func: _foreach_sub.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6077,6 +6352,7 @@ CUDA: foreach_tensor_sub_scalar_kernel_cuda - func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6084,6 +6360,7 @@ CUDA: foreach_tensor_sub_scalar_kernel_cuda_ - func: _foreach_mul.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6091,6 +6368,7 @@ CUDA: foreach_tensor_mul_scalar_kernel_cuda - func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6098,6 +6376,7 @@ CUDA: foreach_tensor_mul_scalar_kernel_cuda_ - func: _foreach_div.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6105,34 +6384,39 @@ CUDA: foreach_tensor_div_scalar_kernel_cuda - func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_div_scalar_kernel_slow_ CUDA: foreach_tensor_div_scalar_kernel_cuda_ -- func: _foreach_add.List(Tensor[] tensors1, Tensor[] tensors2, Scalar alpha=1) -> Tensor[] +- func: _foreach_add.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_add_list_kernel_slow CUDA: foreach_tensor_add_list_kernel_cuda -- func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, Scalar alpha=1) -> () +- func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_add_list_kernel_slow_ CUDA: foreach_tensor_add_list_kernel_cuda_ -- func: _foreach_sub.List(Tensor[] tensors1, Tensor[] tensors2, Scalar alpha=1) -> Tensor[] +- func: _foreach_sub.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_sub_list_kernel_slow CUDA: foreach_tensor_sub_list_kernel_cuda -- func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, Scalar alpha=1) -> () +- func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6140,6 +6424,7 @@ CUDA: foreach_tensor_sub_list_kernel_cuda_ - func: _foreach_mul.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6147,13 +6432,15 @@ CUDA: 
foreach_tensor_mul_list_kernel_cuda - func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_mul_list_kernel_slow_ CUDA: foreach_tensor_mul_list_kernel_cuda_ -- func: _foreach_div.List(Tensor(a!)[] self, Tensor[] other) -> Tensor[] +- func: _foreach_div.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6161,13 +6448,79 @@ CUDA: foreach_tensor_div_list_kernel_cuda - func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_div_list_kernel_slow_ CUDA: foreach_tensor_div_list_kernel_cuda_ +- func: _foreach_add_scalar_list(Tensor[] tensors, float[] scalars) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_add_scalarlist_kernel_slow + CUDA: foreach_tensor_add_scalarlist_kernel_cuda + +- func: _foreach_add_scalar_list_(Tensor(a!)[] self, float[] scalars) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_add_scalarlist_kernel_slow_ + CUDA: foreach_tensor_add_scalarlist_kernel_cuda_ + +- func: _foreach_sub_scalar_list(Tensor[] tensors, float[] scalars) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_sub_scalarlist_kernel_slow + CUDA: foreach_tensor_sub_scalarlist_kernel_cuda + +- func: _foreach_sub_scalar_list_(Tensor(a!)[] self, float[] scalars) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_sub_scalarlist_kernel_slow_ + CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_ + +- func: _foreach_div_scalar_list(Tensor[] tensors, float[] scalars) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_div_scalarlist_kernel_slow + CUDA: foreach_tensor_div_scalarlist_kernel_cuda + +- func: _foreach_div_scalar_list_(Tensor(a!)[] self, float[] scalars) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_div_scalarlist_kernel_slow_ + CUDA: foreach_tensor_div_scalarlist_kernel_cuda_ + +- func: _foreach_mul_scalar_list(Tensor[] tensors, float[] scalars) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_mul_scalarlist_kernel_slow + CUDA: foreach_tensor_mul_scalarlist_kernel_cuda + +- func: _foreach_mul_scalar_list_(Tensor(a!)[] self, float[] scalars) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_mul_scalarlist_kernel_slow_ + CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_ + - func: _foreach_exp(Tensor[] tensors) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6175,6 +6528,7 @@ CUDA: foreach_tensor_exp_cuda - func: _foreach_exp_(Tensor(a!)[] self) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6182,6 +6536,7 @@ CUDA: foreach_tensor_exp_cuda_ - func: _foreach_sqrt(Tensor[] tensors) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6189,6 +6544,7 @@ CUDA: foreach_tensor_sqrt_cuda - func: _foreach_sqrt_(Tensor(a!)[] self) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ 
-6196,6 +6552,7 @@ CUDA: foreach_tensor_sqrt_cuda_ - func: _foreach_addcdiv_(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6203,6 +6560,7 @@ CUDA: foreach_tensor_addcdiv_cuda_ - func: _foreach_addcmul_(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6210,6 +6568,7 @@ CUDA: foreach_tensor_addcmul_cuda_ - func: _foreach_addcdiv(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6217,6 +6576,7 @@ CUDA: foreach_tensor_addcdiv_cuda - func: _foreach_addcmul(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6272,10 +6632,14 @@ - func: mse_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: mse_loss_out - func: mse_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: mse_loss - func: mse_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6429,23 +6793,25 @@ CPU: nll_loss2d_backward_cpu CUDA: legacy::cuda::_thnn_nll_loss2d_backward -- func: smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) +- func: smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, float beta=1.0, *, Tensor(a!) out) -> Tensor(a!) python_module: nn dispatch: CPU: smooth_l1_loss_out CUDA: smooth_l1_loss_out -- func: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor +- func: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean, float beta=1.0) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: smooth_l1_loss -- func: smooth_l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!) +- func: smooth_l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: CPU: smooth_l1_loss_backward_out CUDA: smooth_l1_loss_backward_out -- func: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor +- func: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta) -> Tensor use_c10_dispatcher: full python_module: nn @@ -6465,10 +6831,14 @@ - func: elu.out(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: elu_out - func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: elu - func: elu_backward.grad_input(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6478,6 +6848,8 @@ - func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: elu_backward - func: elu_(Tensor(a!) 
self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor(a!) use_c10_dispatcher: full @@ -6511,6 +6883,8 @@ - func: hardsigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: hardsigmoid_out - func: hardsigmoid(Tensor self) -> Tensor use_c10_dispatcher: full @@ -6522,6 +6896,8 @@ - func: hardsigmoid_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardsigmoid_ - func: hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full @@ -6550,6 +6926,8 @@ - func: hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardtanh_backward - func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!) use_c10_dispatcher: full @@ -6560,14 +6938,20 @@ - func: hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: hardswish_out - func: hardswish(Tensor self) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardswish - func: hardswish_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardswish_ - func: hardswish_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full @@ -6586,11 +6970,13 @@ python_module: nn dispatch: CPU, CUDA: leaky_relu - QuantizedCPU: heaky_relu_quantized_cpu + QuantizedCPU: leaky_relu_quantized_cpu - func: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: leaky_relu_backward - func: leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!) use_c10_dispatcher: full @@ -6656,10 +7042,14 @@ - func: softplus.out(Tensor self, Scalar beta=1, Scalar threshold=20, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: softplus_out - func: softplus(Tensor self, Scalar beta=1, Scalar threshold=20) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softplus - func: softplus_backward.grad_input(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6669,13 +7059,19 @@ - func: softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softplus_backward - func: softshrink.out(Tensor self, Scalar lambd=0.5, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: softshrink_out - func: softshrink(Tensor self, Scalar lambd=0.5) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softshrink - func: softshrink_backward.grad_input(Tensor grad_output, Tensor self, Scalar lambd, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6685,6 +7081,8 @@ - func: softshrink_backward(Tensor grad_output, Tensor self, Scalar lambd) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softshrink_backward - func: adaptive_avg_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!) 
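The beta parameter threaded through the smooth_l1_loss schemas above controls where the loss switches from quadratic to linear: elementwise it is 0.5*d*d/beta for |d| < beta and |d| - 0.5*beta otherwise, with d = input - target. A reference sketch of that rule (not the dispatched kernel itself); the helper name is illustrative and beta is assumed positive:

#include <ATen/ATen.h>

at::Tensor smooth_l1_reference(const at::Tensor& input,
                               const at::Tensor& target,
                               double beta) {
  at::Tensor d = at::abs(input - target);
  // Quadratic inside the beta window, linear outside; mean reduction.
  return at::where(d < beta, 0.5 * d * d / beta, d - 0.5 * beta).mean();
}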
python_module: nn @@ -7447,6 +7845,8 @@ - func: sigmoid_backward(Tensor grad_output, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: sigmoid_backward - func: logit_backward.grad_input(Tensor grad_output, Tensor self, float? eps=None, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -7456,6 +7856,8 @@ - func: logit_backward(Tensor grad_output, Tensor self, float? eps=None) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: logit_backward - func: tanh_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -7465,6 +7867,8 @@ - func: tanh_backward(Tensor grad_output, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: tanh_backward # What's a thnn_conv_ versus a slow_conv_? # @@ -7787,6 +8191,46 @@ use_c10_dispatcher: full variants: function +- func: fft_fftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_ifftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_rfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_irfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_fftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_rfftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_fftshift(Tensor self, int[1]? dim=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_ifftshift(Tensor self, int[1]? dim=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + - func: fft(Tensor self, int signal_ndim, bool normalized=False) -> Tensor use_c10_dispatcher: full variants: function, method diff --git a/aten/src/ATen/native/quantized/QTensor.cpp b/aten/src/ATen/native/quantized/QTensor.cpp index 5efec6420906..8ae92a0d3bec 100644 --- a/aten/src/ATen/native/quantized/QTensor.cpp +++ b/aten/src/ATen/native/quantized/QTensor.cpp @@ -232,5 +232,96 @@ std::tuple _choose_qparams_per_tensor( return std::make_tuple(q_params.scale, q_params.zero_point); } +float calculate_quant_loss( + const float* input, + int numel, + float xmin, + float xmax, + float* q_input, + int bit_width) { + xmin = static_cast(xmin); + float data_range = xmax - xmin; + float qmax = (1 << bit_width) - 1; + float scale = data_range == 0 + ? 1.0 + : static_cast(static_cast(data_range / qmax)); + float inverse_scale = scale == 0 ? 
1.0f : 1.0f / scale; + + float norm = 0.0f; + int i = 0; + + // TODO add FBGEMM kernel + // #ifdef USE_FBGEMM + // #endif + + // remainder loop + for (; i < numel; i++) { + q_input[i] = std::max( + 0.0f, std::min(nearbyint((input[i] - xmin) * inverse_scale), qmax)); + q_input[i] = q_input[i] * scale + xmin; + norm += (input[i] - q_input[i]) * (input[i] - q_input[i]); + } + return std::sqrt(norm); +} + +/* + Helper function to find the best min/max for a tensor to calculate qparams. + It uses a greedy approach to nudge the min and max and calculate the l2 norm + and tries to minimize the quant error by doing `torch.norm(x-fake_quant(x,s,z))` + Returns the optimized xmax and xmin value of the tensor. +*/ +std::tuple choose_qparams_optimized( + const at::Tensor& input_tensor, + int64_t numel, + const int64_t n_bins, + const double ratio, + int64_t bit_width) { + + const float* input_row = input_tensor.data_ptr(); + float xmin = *std::min_element(input_row, input_row + numel); + float xmax = *std::max_element(input_row, input_row + numel); + + float stepsize = (xmax - xmin) / n_bins; + int min_bins = n_bins * (1.0 - (float) ratio); + const float* input = input_tensor.contiguous().data_ptr(); + std::vector q_input(numel); + + float loss = + calculate_quant_loss(input, numel, xmin, xmax, q_input.data(), bit_width); + float best_loss = loss; + + float cur_min = xmin; + float cur_max = xmax; + float cur_loss = loss; + + float thr = min_bins * stepsize; + while (cur_min + thr < cur_max) { + // move left + float loss1 = calculate_quant_loss( + input, numel, cur_min + stepsize, cur_max, q_input.data(), bit_width); + // move right + float loss2 = calculate_quant_loss( + input, numel, cur_min, cur_max - stepsize, q_input.data(), bit_width); + if (cur_loss < loss1 && cur_loss < loss2 && cur_loss < best_loss) { + // found a local optima + best_loss = cur_loss; + xmin = cur_min; + xmax = cur_max; + } + if (loss1 < loss2) { + cur_min = cur_min + stepsize; + cur_loss = loss1; + } else { + cur_max = cur_max - stepsize; + cur_loss = loss2; + } + } + + at::Tensor xmax_tensor = at::empty({1}); + at::Tensor xmin_tensor = at::empty({1}); + xmax_tensor[0] = xmax; + xmin_tensor[0] = xmin; + return std::make_tuple(xmax_tensor, xmin_tensor); +} } // namespace native } // namespace at diff --git a/aten/src/ATen/native/quantized/affine_quantizer.cpp b/aten/src/ATen/native/quantized/affine_quantizer.cpp index cbf116d741e3..1d0aed1174aa 100644 --- a/aten/src/ATen/native/quantized/affine_quantizer.cpp +++ b/aten/src/ATen/native/quantized/affine_quantizer.cpp @@ -17,6 +17,8 @@ DEFINE_DISPATCH(quantize_tensor_per_channel_float_qparams_stub); DEFINE_DISPATCH(dequantize_tensor_per_tensor_affine_stub); DEFINE_DISPATCH(dequantize_tensor_per_channel_affine_stub); DEFINE_DISPATCH(dequantize_tensor_per_channel_float_qparams_stub); +DEFINE_DISPATCH(quantize_tensor_per_tensor_affine_sub_byte_stub); +DEFINE_DISPATCH(dequantize_tensor_per_tensor_affine_sub_byte_stub); namespace { @@ -55,7 +57,8 @@ void checkQuantizedTensor(const std::string& fn_name, Tensor t) { fn_name, " expects a ", caffe2::TypeMeta::Make(), - " Tensor"); + " Tensor, got ", + t.scalar_type()); } template @@ -103,13 +106,21 @@ Tensor quantize_tensor_per_tensor_affine( checkSameDevice(fn_name, rtensor, qtensor); checkSameSize(fn_name, qtensor, rtensor); - AT_DISPATCH_QINT_TYPES(qtensor.scalar_type(), fn_name, [&]() { + AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES(qtensor.scalar_type(), fn_name, [&]() { checkQuantizedTensor(fn_name, qtensor); checkZeroPoint(fn_name, 
zero_point); }); - quantize_tensor_per_tensor_affine_stub( + // Temporary solution to pack the tensor if dtype is torch.quint4x2 + // Can move this into the fbgemm::Quantize op. + if (qtensor.scalar_type() == at::ScalarType::QUInt4x2) { + quantize_tensor_per_tensor_affine_sub_byte_stub( + rtensor.device().type(), rtensor, qtensor, scale, zero_point); + } + else { + quantize_tensor_per_tensor_affine_stub( rtensor.device().type(), rtensor, qtensor, scale, zero_point); + } return qtensor; } @@ -163,7 +174,7 @@ Tensor quantize_tensor_per_channel_float_qparams( checkSameDevice(fn_name, rtensor, qtensor); checkSameSize(fn_name, qtensor, rtensor); - AT_DISPATCH_QINT_TYPES(qtensor.scalar_type(), fn_name, [&]() { + AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES(qtensor.scalar_type(), fn_name, [&]() { checkQuantizedTensor(fn_name, qtensor); }); @@ -195,13 +206,18 @@ Tensor dequantize_tensor_per_tensor_affine( checkSameDevice(fn_name, rtensor, qtensor); checkSameSize(fn_name, qtensor, rtensor); - AT_DISPATCH_QINT_TYPES(qtensor.scalar_type(), fn_name, [&]() { + AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES(qtensor.scalar_type(), fn_name, [&]() { checkQuantizedTensor(fn_name, qtensor); checkZeroPoint(fn_name, zero_point); }); - dequantize_tensor_per_tensor_affine_stub( - qtensor.device().type(), qtensor, rtensor, scale, zero_point); + if (qtensor.scalar_type() == at::ScalarType::QUInt4x2) { + dequantize_tensor_per_tensor_affine_sub_byte_stub( + qtensor.device().type(), qtensor, rtensor, scale, zero_point); + } else { + dequantize_tensor_per_tensor_affine_stub( + qtensor.device().type(), qtensor, rtensor, scale, zero_point); + } return rtensor; } @@ -253,7 +269,7 @@ Tensor dequantize_tensor_per_channel_float_qparams( checkSameDevice(fn_name, rtensor, qtensor); checkSameSize(fn_name, qtensor, rtensor); - AT_DISPATCH_QINT_TYPES(qtensor.scalar_type(), fn_name, [&]() { + AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES(qtensor.scalar_type(), fn_name, [&]() { checkQuantizedTensor(fn_name, qtensor); }); @@ -394,17 +410,13 @@ CAFFE2_API float dequantize_val(double scale, int64_t zero_point, T value) { * Note: For the case of embedding quantization we will set zero_point * to (-Xmin/scale), where Xmin is the min value in input tensor row. */ -template -T quantize_val_float_qparams(float scale, float zero_point, float value) { - int64_t qvalue; +int quantize_val_float_qparams(float scale, float zero_point, float value, int qmin, int qmax) { + int qvalue; - // TODO make sure qmax and qmin for dtypes other than int8, uint8 is correctly defined. - constexpr int64_t qmin = std::numeric_limits::min(); - constexpr int64_t qmax = std::numeric_limits::max(); float inv_scale = scale == 0 ? 
1.0f : 1.0f / scale; qvalue = lrintf(value * inv_scale + zero_point); qvalue = std::max(qmin, std::min(qvalue, qmax)); - return static_cast(qvalue); + return qvalue; } template @@ -491,11 +503,5 @@ requantize_from_int(double, int64_t, int64_t); template CAFFE2_API qint32 requantize_from_int(double, int64_t, int64_t); -template CAFFE2_API qint8 -quantize_val_float_qparams(float scale, float zero_point, float value); -template CAFFE2_API quint8 -quantize_val_float_qparams(float scale, float zero_point, float value); -template CAFFE2_API qint32 -quantize_val_float_qparams(float scale, float zero_point, float value); } // namespace native } // namespace at diff --git a/aten/src/ATen/native/quantized/affine_quantizer.h b/aten/src/ATen/native/quantized/affine_quantizer.h index 862a36f5f61a..670b119652cd 100644 --- a/aten/src/ATen/native/quantized/affine_quantizer.h +++ b/aten/src/ATen/native/quantized/affine_quantizer.h @@ -77,6 +77,12 @@ using dequantize_tensor_per_channel_float_qparams_fn = void (*)( Tensor zero_points, int64_t axis); +using quantize_tensor_per_tensor_affine_sub_byte_fn = + void (*)(Tensor rtensor, Tensor qtensor, float scale, float zero_point); + +using dequantize_tensor_per_tensor_affine_sub_byte_fn = + void (*)(Tensor qtensor, Tensor rtensor, float scale, float zero_point); + DECLARE_DISPATCH( quantize_tensor_per_tensor_affine_fn, quantize_tensor_per_tensor_affine_stub); @@ -97,6 +103,13 @@ DECLARE_DISPATCH( dequantize_tensor_per_channel_float_qparams_fn, dequantize_tensor_per_channel_float_qparams_stub); +DECLARE_DISPATCH( + quantize_tensor_per_tensor_affine_sub_byte_fn, + quantize_tensor_per_tensor_affine_sub_byte_stub); + +DECLARE_DISPATCH( + dequantize_tensor_per_tensor_affine_sub_byte_fn, + dequantize_tensor_per_tensor_affine_sub_byte_stub); // Quantize a float value into a uint value given scale and zero_point template @@ -145,8 +158,7 @@ template CAFFE2_API DST_T requantize_from_int(double multiplier, int64_t zero_point, int64_t src); -template -CAFFE2_API T quantize_val_float_qparams(float scale, float zero_point, float value); +int quantize_val_float_qparams(float scale, float zero_point, float value, int qmin, int qmax); } // namespace native } // namespace at diff --git a/aten/src/ATen/native/quantized/cpu/int_repr_quant.cpp b/aten/src/ATen/native/quantized/cpu/int_repr_quant.cpp index 65036302e6ef..29e7a9b259bb 100644 --- a/aten/src/ATen/native/quantized/cpu/int_repr_quant.cpp +++ b/aten/src/ATen/native/quantized/cpu/int_repr_quant.cpp @@ -10,17 +10,29 @@ namespace native { // format of the output the same as input Tensor int_repr_quantized_cpu(const Tensor& self) { Tensor dst; - AT_DISPATCH_QINT_TYPES(self.scalar_type(), "int_repr", [&]() { - dst = at::empty( - self.sizes(), - self.options().dtype(UNDERLYING_TYPE), - self.suggest_memory_format()); - auto iter = TensorIteratorConfig() - .check_all_same_dtype(false) - .add_output(dst) - .add_input(self) - .build(); - cpu_kernel(iter, [](scalar_t value) -> underlying_t { return value.val_; }); + AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES(self.scalar_type(), "int_repr", [&]() { + if (bit_width == 4) { + int64_t out_size = std::ceil(self.numel() * 0.5); + dst = at::empty( + {out_size}, + self.options().dtype(UNDERLYING_TYPE), + self.suggest_memory_format()); + const underlying_t* qdata = reinterpret_cast(self.data_ptr()); + for (int64_t i = 0; i < dst.numel(); ++i) { + dst[i] = static_cast(qdata[i]); + } + } else { + dst = at::empty( + self.sizes(), + self.options().dtype(UNDERLYING_TYPE), + self.suggest_memory_format()); 
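int_repr_quantized_cpu above sizes its output so that two 4-bit values share one underlying byte (out_size = ceil(numel * 0.5)). A standalone sketch of that sizing rule; the helper name is illustrative:

#include <cstdint>

// Bytes needed to store `numel` packed values of `bit_width` bits (bit_width divides 8).
int64_t packed_bytes(int64_t numel, int bit_width) {
  const int elem_per_byte = 8 / bit_width;             // 2 for 4-bit, 1 for 8-bit
  return (numel + elem_per_byte - 1) / elem_per_byte;  // ceil division
}
// e.g. packed_bytes(5, 4) == 3, packed_bytes(5, 8) == 5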
+ auto iter = TensorIteratorConfig() + .check_all_same_dtype(false) + .add_output(dst) + .add_input(self) + .build(); + cpu_kernel(iter, [](scalar_t value) -> underlying_t { return value.val_; }); + } }); return dst; } diff --git a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp index ddde74b61d52..a65e9f00f1d8 100644 --- a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp +++ b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp @@ -2592,7 +2592,8 @@ void dequantize_per_channel_affine_kernel( Tensor rtensor, Tensor scales, Tensor zero_points, - int64_t axis) { + int64_t axis, + int bit_width=8) { // For contiguous tensors, e.g. NCHW, arbitrary axis can be used. // For channels_last/3d however axis == 0 or 1. @@ -2611,6 +2612,7 @@ void dequantize_per_channel_affine_kernel( check_tensor_memory_format(qtensor, rtensor); const auto* qd = qtensor.data_ptr(); float* rd = rtensor.data_ptr(); + const auto elem_per_byte = 8 / bit_width; if (axis == 1 && (rtensor.is_contiguous(MemoryFormat::ChannelsLast) || rtensor.is_contiguous(MemoryFormat::ChannelsLast3d))) { for (auto b = 0; b < batches; ++b) { @@ -2619,8 +2621,12 @@ void dequantize_per_channel_affine_kernel( auto i = b * channel * elements_per_channel + e * channel + c; // We need to convert the qint8 value to float to ensure the // subtraction subexpression returns a float - rd[i] = (static_cast(qd[i].val_) - zero_points_data[c]) * - scales_data[c]; + auto qvalue = qd[i / elem_per_byte].val_; + if (bit_width < 8) { + qvalue >>= (i % elem_per_byte) * bit_width; + qvalue &= (1 << bit_width) - 1; + } + rd[i] = (static_cast(qvalue) - zero_points_data[c]) * scales_data[c]; } } } @@ -2632,8 +2638,12 @@ void dequantize_per_channel_affine_kernel( c * elements_per_channel + e; // We need to convert the qint8 value to float to ensure the // subtraction subexpression returns a float - rd[i] = (static_cast(qd[i].val_) - zero_points_data[c]) * - scales_data[c]; + auto qvalue = qd[i / elem_per_byte].val_; + if (bit_width < 8) { + qvalue >>= (i % elem_per_byte) * bit_width; + qvalue &= (1 << bit_width) - 1; + } + rd[i] = (static_cast(qvalue) - zero_points_data[c]) * scales_data[c]; } } } @@ -2667,7 +2677,7 @@ void quantize_tensor_per_channel_float_qparams_cpu( TORCH_CHECK(rtensor.is_contiguous() || (axis <=1), "If tensor is channels_last contig then per channel quantization " "is supported only for axis = 0 or 1."); - AT_DISPATCH_QINT_TYPES( + AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES( qtensor.scalar_type(), "quantize_tensor_per_channel_float_qparams_cpu", [&]() { int64_t batches = size_to_dim_(axis, rtensor.sizes()); int64_t elements_per_channel = @@ -2677,15 +2687,22 @@ void quantize_tensor_per_channel_float_qparams_cpu( auto zero_points_data = zero_points.data_ptr(); check_tensor_memory_format(rtensor, qtensor); const float* rdata = rtensor.data_ptr(); - auto qdata = qtensor.data_ptr(); + auto qdata = reinterpret_cast(qtensor.data_ptr()); + const auto elem_per_byte = CHAR_BIT / bit_width; + int qvalue = 0; if (axis == 1 && (rtensor.is_contiguous(MemoryFormat::ChannelsLast) || rtensor.is_contiguous(MemoryFormat::ChannelsLast3d))) { for (auto b = 0; b < batches; ++b) { for (auto e = 0; e < elements_per_channel; ++e) { for (auto c = 0; c < channel; ++c) { auto i = b * channel * elements_per_channel + e * channel + c; - qdata[i] = quantize_val_float_qparams( - scales_data[c], zero_points_data[c], rdata[i]); + qvalue = quantize_val_float_qparams( + 
scales_data[c], zero_points_data[c], rdata[i], quant_min, quant_max); + if (i % elem_per_byte == 0) { + qdata[i / elem_per_byte] = static_cast(qvalue); + } else { + qdata[i / elem_per_byte] |= static_cast(qvalue << ((i % elem_per_byte) * bit_width)); + } } } } @@ -2695,8 +2712,13 @@ void quantize_tensor_per_channel_float_qparams_cpu( for (auto e = 0; e < elements_per_channel; ++e) { auto i = b * channel * elements_per_channel + c * elements_per_channel + e; - qdata[i] = quantize_val_float_qparams( - scales_data[c], zero_points_data[c], rdata[i]); + qvalue = quantize_val_float_qparams( + scales_data[c], zero_points_data[c], rdata[i], quant_min, quant_max); + if (i % elem_per_byte == 0) { + qdata[i / elem_per_byte] = static_cast(qvalue); + } else { + qdata[i / elem_per_byte] |= static_cast(qvalue << ((i % elem_per_byte) * bit_width)); + } } } } @@ -2710,12 +2732,66 @@ void dequantize_tensor_per_channel_float_qparams_cpu( Tensor scales, Tensor zero_points, int64_t axis) { - AT_DISPATCH_QINT_TYPES( + AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES( qtensor.scalar_type(), "dequantize_tensor_per_channel_float_qparams_cpu", [&]() { - dequantize_per_channel_affine_kernel(qtensor, rtensor, scales, zero_points, axis); + dequantize_per_channel_affine_kernel(qtensor, rtensor, scales, zero_points, axis, bit_width); }); } +void quantize_tensor_per_tensor_affine_sub_byte_cpu( + Tensor rtensor, + Tensor qtensor, + float scale, + float zero_point) { + // TODO Use fbgemm kernel to pack values + AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES( + qtensor.scalar_type(), "quantize_tensor_per_tensor_affine_sub_byte_cpu", [&]() { + check_tensor_memory_format(rtensor, qtensor); + const float* const rdata = rtensor.data_ptr(); + auto qdata = reinterpret_cast(qtensor.data_ptr()); + auto numel = rtensor.numel(); + const auto elem_per_byte = CHAR_BIT / bit_width; + for (int i = 0; i < numel; ++i) { + float inv_scale = scale == 0 ? 1.0f : 1.0f / scale; + int qvalue = lrintf(std::nearbyint(rdata[i] * inv_scale) + zero_point); + qvalue = std::max(quant_min, std::min(qvalue, quant_max)); + + // We pack sub_byte values and align them to a byte. + // Eg. for 4-bits Index 0 is packed in the lower 4-bits + // and index 1 is packed in the upper 4-bits. 
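As a sanity check of the packing scheme the comment above describes (even indices in the low bits of a byte, odd indices in the high bits), the following self-contained sketch packs and unpacks an array of 4-bit values with the same index and shift arithmetic; the helper names and test data are illustrative only:

#include <cassert>
#include <cstdint>
#include <vector>

constexpr int kBitWidth = 4;
constexpr int kElemPerByte = 8 / kBitWidth;  // two values per byte

// Pack already-quantized values (each in [0, 15]) two per byte.
std::vector<uint8_t> pack4(const std::vector<int>& q) {
  std::vector<uint8_t> out((q.size() + kElemPerByte - 1) / kElemPerByte, 0);
  for (size_t i = 0; i < q.size(); ++i) {
    if (i % kElemPerByte == 0) {
      out[i / kElemPerByte] = static_cast<uint8_t>(q[i]);                 // lower 4 bits
    } else {
      out[i / kElemPerByte] |= static_cast<uint8_t>(q[i] << kBitWidth);   // upper 4 bits
    }
  }
  return out;
}

// Recover value i with the same shift-and-mask the dequantize kernels use.
int unpack4(const std::vector<uint8_t>& packed, size_t i) {
  int v = packed[i / kElemPerByte];
  v >>= (i % kElemPerByte) * kBitWidth;
  return v & ((1 << kBitWidth) - 1);
}

int main() {
  std::vector<int> q = {3, 12, 7, 0, 15};
  auto packed = pack4(q);  // 3 bytes for 5 values
  for (size_t i = 0; i < q.size(); ++i) {
    assert(unpack4(packed, i) == q[i]);
  }
}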
+ if (i % elem_per_byte == 0) { + qdata[i / elem_per_byte] = static_cast(qvalue); + } else { + qdata[i / elem_per_byte] |= static_cast(qvalue << ((i % elem_per_byte) * bit_width)); + } + } // for numel + }); +} + +void dequantize_tensor_per_tensor_affine_sub_byte_cpu( + Tensor qtensor, + Tensor rtensor, + float scale, + float zero_point) { + // TODO Use fbgemm kernel to pack values + AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES( + qtensor.scalar_type(), "dequantize_tensor_per_tensor_affine_sub_byte_cpu", [&]() { + check_tensor_memory_format(rtensor, qtensor); + auto rdata = rtensor.data_ptr(); + const underlying_t* qdata = reinterpret_cast(qtensor.data_ptr()); + auto numel = rtensor.numel(); + const auto elem_per_byte = CHAR_BIT / bit_width; + + for (int i = 0; i < numel; ++i) { + underlying_t qvalue = qdata[i / elem_per_byte]; + qvalue >>= (i % elem_per_byte) * bit_width; + qvalue &= (1 << bit_width) - 1; + rdata[i] = (static_cast(qvalue) - zero_point) * scale; + } + }); + +} + } // namespace REGISTER_DISPATCH(dequantize_tensor_per_channel_affine_stub, @@ -2773,6 +2849,13 @@ REGISTER_DISPATCH( REGISTER_DISPATCH(quantized_normalize_stub, &quantized_normalize_kernel); REGISTER_DISPATCH(qupsample_bilinear2d_nhwc_stub, &qupsample_bilinear2d_nhwc_kernel); +REGISTER_DISPATCH( + quantize_tensor_per_tensor_affine_sub_byte_stub, + &quantize_tensor_per_tensor_affine_sub_byte_cpu); +REGISTER_DISPATCH( + dequantize_tensor_per_tensor_affine_sub_byte_stub, + &dequantize_tensor_per_tensor_affine_sub_byte_cpu); + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/quantized/cpu/qadd.cpp b/aten/src/ATen/native/quantized/cpu/qadd.cpp index 22db20eeedb6..a12718502dd1 100644 --- a/aten/src/ATen/native/quantized/cpu/qadd.cpp +++ b/aten/src/ATen/native/quantized/cpu/qadd.cpp @@ -266,29 +266,29 @@ Tensor qadd_scalar_tensor_out(Tensor qa, Tensor b, Tensor out) { } TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { - m.impl("add", TORCH_FN(qadd)); - m.impl("add.out", TORCH_FN(qadd_out)); - m.impl("add.Scalar", TORCH_FN(qadd_scalar)); - m.impl("add.Scalar_out", TORCH_FN(qadd_scalar_out)); - m.impl("add_relu", TORCH_FN(qadd)); - m.impl("add_relu.out", TORCH_FN(qadd_out)); - m.impl("add_relu.Scalar", TORCH_FN(qadd_scalar)); - m.impl("add_relu.Scalar_out", TORCH_FN(qadd_scalar_out)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add"), TORCH_FN(qadd)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add.out"), TORCH_FN(qadd_out)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add.Scalar"), TORCH_FN(qadd_scalar)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add.Scalar_out"), TORCH_FN(qadd_scalar_out)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_relu"), TORCH_FN(qadd)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_relu.out"), TORCH_FN(qadd_out)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_relu.Scalar"), TORCH_FN(qadd_scalar)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_relu.Scalar_out"), TORCH_FN(qadd_scalar_out)); // deprecated functions, kept for backward compatibility - m.impl("add_out", TORCH_FN(qadd_out)); - m.impl("add_relu_out", TORCH_FN(qadd_out)); - m.impl("add_scalar", TORCH_FN(qadd_scalar)); - m.impl("add_scalar_relu", TORCH_FN(qadd_scalar)); - m.impl("add_scalar_out", TORCH_FN(qadd_scalar_out)); - m.impl("add_scalar_relu_out", TORCH_FN(qadd_scalar_out)); - m.impl("add_scalar.Tensor", TORCH_FN(qadd_scalar_tensor)); - m.impl("add_scalar_relu.Tensor", TORCH_FN(qadd_scalar_tensor)); - m.impl("add_scalar_out.Tensor", TORCH_FN(qadd_scalar_tensor_out)); - m.impl("add_scalar_relu_out.Tensor", 
TORCH_FN(qadd_scalar_tensor_out)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_out"), TORCH_FN(qadd_out)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_relu_out"), TORCH_FN(qadd_out)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_scalar"), TORCH_FN(qadd_scalar)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_scalar_relu"), TORCH_FN(qadd_scalar)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_scalar_out"), TORCH_FN(qadd_scalar_out)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_scalar_relu_out"), TORCH_FN(qadd_scalar_out)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_scalar.Tensor"), TORCH_FN(qadd_scalar_tensor)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_scalar_relu.Tensor"), TORCH_FN(qadd_scalar_tensor)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_scalar_out.Tensor"), TORCH_FN(qadd_scalar_tensor_out)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_scalar_relu_out.Tensor"), TORCH_FN(qadd_scalar_tensor_out)); } TORCH_LIBRARY_IMPL(_quantized, QuantizedCPU, m) { - m.impl("add", TORCH_FN(qadd)); + m.impl(TORCH_SELECTIVE_NAME("_quantized::add"), TORCH_FN(qadd)); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qbatch_norm.cpp b/aten/src/ATen/native/quantized/cpu/qbatch_norm.cpp index effafcacc76e..b053940abba2 100644 --- a/aten/src/ATen/native/quantized/cpu/qbatch_norm.cpp +++ b/aten/src/ATen/native/quantized/cpu/qbatch_norm.cpp @@ -378,14 +378,14 @@ Tensor quantized_batch_norm( } TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { - m.impl("batch_norm", TORCH_FN(q_batch_norm_impl)); - m.impl("batch_norm_relu", TORCH_FN(q_batch_norm_impl)); - m.impl("batch_norm1d", TORCH_FN(q_batch_norm1d_impl)); - m.impl("batch_norm1d_relu", TORCH_FN(q_batch_norm1d_impl)); - m.impl("batch_norm2d", TORCH_FN(q_batch_norm2d_impl)); - m.impl("batch_norm2d_relu", TORCH_FN(q_batch_norm2d_impl)); - m.impl("batch_norm3d", TORCH_FN(q_batch_norm3d_impl)); - m.impl("batch_norm3d_relu", TORCH_FN(q_batch_norm3d_impl)); + m.impl(TORCH_SELECTIVE_NAME("quantized::batch_norm"), TORCH_FN(q_batch_norm_impl)); + m.impl(TORCH_SELECTIVE_NAME("quantized::batch_norm_relu"), TORCH_FN(q_batch_norm_impl)); + m.impl(TORCH_SELECTIVE_NAME("quantized::batch_norm1d"), TORCH_FN(q_batch_norm1d_impl)); + m.impl(TORCH_SELECTIVE_NAME("quantized::batch_norm1d_relu"), TORCH_FN(q_batch_norm1d_impl)); + m.impl(TORCH_SELECTIVE_NAME("quantized::batch_norm2d"), TORCH_FN(q_batch_norm2d_impl)); + m.impl(TORCH_SELECTIVE_NAME("quantized::batch_norm2d_relu"), TORCH_FN(q_batch_norm2d_impl)); + m.impl(TORCH_SELECTIVE_NAME("quantized::batch_norm3d"), TORCH_FN(q_batch_norm3d_impl)); + m.impl(TORCH_SELECTIVE_NAME("quantized::batch_norm3d_relu"), TORCH_FN(q_batch_norm3d_impl)); } } // namespace native diff --git a/aten/src/ATen/native/quantized/cpu/qclamp.cpp b/aten/src/ATen/native/quantized/cpu/qclamp.cpp index a70016307785..3a8b647d320f 100644 --- a/aten/src/ATen/native/quantized/cpu/qclamp.cpp +++ b/aten/src/ATen/native/quantized/cpu/qclamp.cpp @@ -140,7 +140,7 @@ Tensor& hardtanh_quantized_cpu_( } TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { - m.impl("clamp", TORCH_FN(clamp_quantized_cpu)); + m.impl(TORCH_SELECTIVE_NAME("quantized::clamp"), TORCH_FN(clamp_quantized_cpu)); } } // namespace native diff --git a/aten/src/ATen/native/quantized/cpu/qconcat.cpp b/aten/src/ATen/native/quantized/cpu/qconcat.cpp index 0656f40e3554..ca08c365d83d 100644 --- a/aten/src/ATen/native/quantized/cpu/qconcat.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconcat.cpp @@ -102,10 +102,10 @@ Tensor qcat_out(const c10::List& qxs, int64_t dim, Tensor out) 
{ } // namespace TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { - m.impl("cat", TORCH_FN(qcat)); - m.impl("cat_relu", TORCH_FN(qcat)); - m.impl("cat_out", TORCH_FN(qcat_out)); - m.impl("cat_relu_out", TORCH_FN(qcat_out)); + m.impl(TORCH_SELECTIVE_NAME("quantized::cat"), TORCH_FN(qcat)); + m.impl(TORCH_SELECTIVE_NAME("quantized::cat_relu"), TORCH_FN(qcat)); + m.impl(TORCH_SELECTIVE_NAME("quantized::cat_out"), TORCH_FN(qcat_out)); + m.impl(TORCH_SELECTIVE_NAME("quantized::cat_relu_out"), TORCH_FN(qcat_out)); } Tensor cat_quantized_cpu(TensorList qxs, int64_t dim) { diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp index 12563eb36d44..cb232a5d20c3 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp @@ -277,6 +277,12 @@ at::Tensor PackedConvWeight::apply_impl( : "quantized::conv"; TORCH_CHECK( fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM."); + TORCH_CHECK( + !transpose(), + "FBGEMM currently does NOT support transposed convolution. ", + "Meanwhile you have multiple options: 1) Replace the ConvTranspose with ", + "the 'dequant->conv_tranpose->quant'; 2) Change the current qengine to " + "QNNPACK using 'torch.backends.quantized.engine = \"qnnpack\"'."); ConvDimChecks( act.ndimension(), stride().size(), padding().size(), output_padding().size(), dilation().size(), func_name, transpose()); @@ -850,30 +856,30 @@ class QConvInt8ForBC final { }; TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { - m.impl("conv1d", QConv1dInt8::run); - m.impl("conv1d_relu", QConv1dInt8::run); - m.impl("conv2d.new", QConvInt8<2, false>::run); - m.impl("conv2d_relu.new", QConvInt8<2, true>::run); - m.impl("conv3d.new", QConvInt8<3, false>::run); - m.impl("conv3d_relu.new", QConvInt8<3, true>::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv1d"), QConv1dInt8::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv1d_relu"), QConv1dInt8::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d.new"), QConvInt8<2, false>::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_relu.new"), QConvInt8<2, true>::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d.new"), QConvInt8<3, false>::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_relu.new"), QConvInt8<3, true>::run); // for backward compatibility - m.impl("conv2d", QConvInt8ForBC<2, false>::run); - m.impl("conv2d_relu", QConvInt8ForBC<2, true>::run); - m.impl("conv3d", QConvInt8ForBC<3, false>::run); - m.impl("conv3d_relu", QConvInt8ForBC<3, true>::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d"), QConvInt8ForBC<2, false>::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_relu"), QConvInt8ForBC<2, true>::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d"), QConvInt8ForBC<3, false>::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_relu"), QConvInt8ForBC<3, true>::run); // transpose - m.impl("conv_transpose1d", QConv1dInt8::run); - m.impl("conv_transpose2d", QConvInt8<2, false>::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose1d"), QConv1dInt8::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d"), QConvInt8<2, false>::run); } TORCH_LIBRARY_IMPL(_quantized, QuantizedCPU, m) { - m.impl("conv2d", QConvInt8<2, false>::run); - m.impl("conv2d_relu", QConvInt8<2, true>::run); + m.impl(TORCH_SELECTIVE_NAME("_quantized::conv2d"), QConvInt8<2, false>::run); + m.impl(TORCH_SELECTIVE_NAME("_quantized::conv2d_relu"), QConvInt8<2, true>::run); // transpose - m.impl("conv_transpose1d", 
QConv1dInt8::run); - m.impl("conv_transpose2d", QConvInt8<2, false>::run); + m.impl(TORCH_SELECTIVE_NAME("_quantized::conv_transpose1d"), QConv1dInt8::run); + m.impl(TORCH_SELECTIVE_NAME("_quantized::conv_transpose2d"), QConvInt8<2, false>::run); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp index 4387b255dfe1..7bf84c9d5646 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp @@ -415,21 +415,21 @@ class QConv1dPackWeightInt8 final { TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { // Conv // conv_prepack is deprecated, please use conv2d_prepack for 2D conv. - m.impl("conv_prepack", TORCH_FN(QConvPackWeightInt8<2>::run_conv)); - m.impl("conv1d_prepack", TORCH_FN(QConv1dPackWeightInt8::run_conv)); - m.impl("conv2d_prepack", TORCH_FN(QConvPackWeightInt8<2>::run_conv)); - m.impl("conv3d_prepack", TORCH_FN(QConvPackWeightInt8<3>::run_conv)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_prepack"), TORCH_FN(QConvPackWeightInt8<2>::run_conv)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv1d_prepack"), TORCH_FN(QConv1dPackWeightInt8::run_conv)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_prepack"), TORCH_FN(QConvPackWeightInt8<2>::run_conv)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_prepack"), TORCH_FN(QConvPackWeightInt8<3>::run_conv)); // ConvTranspose - m.impl("conv_transpose1d_prepack", TORCH_FN(QConv1dPackWeightInt8::run_deconv)); - m.impl("conv_transpose2d_prepack", TORCH_FN(QConvPackWeightInt8<2>::run_deconv)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose1d_prepack"), TORCH_FN(QConv1dPackWeightInt8::run_deconv)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_prepack"), TORCH_FN(QConvPackWeightInt8<2>::run_deconv)); } TORCH_LIBRARY_IMPL(_quantized, QuantizedCPU, m) { // Conv - m.impl("conv2d_prepack", TORCH_FN(QConvPackWeightInt8<2>::run_conv)); + m.impl(TORCH_SELECTIVE_NAME("_quantized::conv2d_prepack"), TORCH_FN(QConvPackWeightInt8<2>::run_conv)); // ConvTranspose - m.impl("conv_transpose1d_prepack", TORCH_FN(QConv1dPackWeightInt8::run_deconv)); - m.impl("conv_transpose2d_prepack", TORCH_FN(QConvPackWeightInt8<2>::run_deconv)); + m.impl(TORCH_SELECTIVE_NAME("_quantized::conv_transpose1d_prepack"), TORCH_FN(QConv1dPackWeightInt8::run_deconv)); + m.impl(TORCH_SELECTIVE_NAME("_quantized::conv_transpose2d_prepack"), TORCH_FN(QConvPackWeightInt8<2>::run_deconv)); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qconv_unpack.cpp b/aten/src/ATen/native/quantized/cpu/qconv_unpack.cpp index 9e8a103cb17c..0886fdc7342e 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv_unpack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv_unpack.cpp @@ -243,36 +243,36 @@ class QConvTranspose final { TORCH_LIBRARY_IMPL(quantized, CatchAll, m) { // conv_unpack is deprecated, please use conv2d_unpack for 2D conv. 
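The registration hunks in this file and in the files that follow all apply the same migration: bare operator-name strings become TORCH_SELECTIVE_NAME and schema strings become TORCH_SELECTIVE_SCHEMA, so selective builds can record which quantized ops are used and strip the rest. A minimal sketch of the pattern, with a hypothetical operator quantized::my_op and kernel standing in for the real ones:

#include <ATen/ATen.h>
#include <torch/library.h>

// Hypothetical kernel; stands in for the quantized kernels registered in this patch.
at::Tensor my_op_cpu(at::Tensor qx) {
  return qx;
}

TORCH_LIBRARY_FRAGMENT(quantized, m) {
  // The schema is wrapped so a selective build can record the op as used.
  m.def(TORCH_SELECTIVE_SCHEMA("quantized::my_op(Tensor qx) -> Tensor"));
}

TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) {
  // The impl side now also names the op with its full namespace::name string.
  m.impl(TORCH_SELECTIVE_NAME("quantized::my_op"), TORCH_FN(my_op_cpu));
}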
- m.impl("conv_unpack", TORCH_FN(QConvUnpackWeightsInt8<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_unpack"), TORCH_FN(QConvUnpackWeightsInt8<2>::run)); // We use conv2d_unpack to be consistent with conv3d_unpack - m.impl("conv1d_unpack", TORCH_FN(QConv1dUnpackWeightsInt8::run)); - m.impl("conv2d_unpack", TORCH_FN(QConvUnpackWeightsInt8<2>::run)); - m.impl("conv3d_unpack", TORCH_FN(QConvUnpackWeightsInt8<3>::run)); - - m.impl("conv2d_stride", TORCH_FN(QConvStride<2>::run)); - m.impl("conv2d_padding", TORCH_FN(QConvPadding<2>::run)); - m.impl("conv2d_output_padding", TORCH_FN(QConvOutputPadding<2>::run)); - m.impl("conv2d_dilation", TORCH_FN(QConvDilation<2>::run)); - m.impl("conv2d_groups", TORCH_FN(QConvGroups<2>::run)); - m.impl("conv2d_transpose", TORCH_FN(QConvTranspose<2>::run)); - - m.impl("conv3d_stride", TORCH_FN(QConvStride<3>::run)); - m.impl("conv3d_padding", TORCH_FN(QConvPadding<3>::run)); - m.impl("conv3d_output_padding", TORCH_FN(QConvOutputPadding<3>::run)); - m.impl("conv3d_dilation", TORCH_FN(QConvDilation<3>::run)); - m.impl("conv3d_groups", TORCH_FN(QConvGroups<3>::run)); - m.impl("conv3d_transpose", TORCH_FN(QConvTranspose<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv1d_unpack"), TORCH_FN(QConv1dUnpackWeightsInt8::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_unpack"), TORCH_FN(QConvUnpackWeightsInt8<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_unpack"), TORCH_FN(QConvUnpackWeightsInt8<3>::run)); + + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_stride"), TORCH_FN(QConvStride<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_padding"), TORCH_FN(QConvPadding<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_output_padding"), TORCH_FN(QConvOutputPadding<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_dilation"), TORCH_FN(QConvDilation<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_groups"), TORCH_FN(QConvGroups<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_transpose"), TORCH_FN(QConvTranspose<2>::run)); + + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_stride"), TORCH_FN(QConvStride<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_padding"), TORCH_FN(QConvPadding<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_output_padding"), TORCH_FN(QConvOutputPadding<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_dilation"), TORCH_FN(QConvDilation<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_groups"), TORCH_FN(QConvGroups<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_transpose"), TORCH_FN(QConvTranspose<3>::run)); // ConvTranspose is the same, however, we want to have different name. 
- m.impl("conv_transpose1d_unpack", TORCH_FN(QConv1dUnpackWeightsInt8::run)); - m.impl("conv_transpose2d_unpack", TORCH_FN(QConvUnpackWeightsInt8<2>::run)); - - m.impl("conv_transpose2d_stride", TORCH_FN(QConvStride<2>::run)); - m.impl("conv_transpose2d_padding", TORCH_FN(QConvPadding<2>::run)); - m.impl("conv_transpose2d_output_padding", TORCH_FN(QConvOutputPadding<2>::run)); - m.impl("conv_transpose2d_dilation", TORCH_FN(QConvDilation<2>::run)); - m.impl("conv_transpose2d_groups", TORCH_FN(QConvGroups<2>::run)); - m.impl("conv_transpose2d_transpose", TORCH_FN(QConvTranspose<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose1d_unpack"), TORCH_FN(QConv1dUnpackWeightsInt8::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_unpack"), TORCH_FN(QConvUnpackWeightsInt8<2>::run)); + + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_stride"), TORCH_FN(QConvStride<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_padding"), TORCH_FN(QConvPadding<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_output_padding"), TORCH_FN(QConvOutputPadding<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_dilation"), TORCH_FN(QConvDilation<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_groups"), TORCH_FN(QConvGroups<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_transpose"), TORCH_FN(QConvTranspose<2>::run)); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qelu.cpp b/aten/src/ATen/native/quantized/cpu/qelu.cpp index 92b635471e78..e873506026e6 100644 --- a/aten/src/ATen/native/quantized/cpu/qelu.cpp +++ b/aten/src/ATen/native/quantized/cpu/qelu.cpp @@ -24,8 +24,8 @@ Tensor quantized_celu(const Tensor& qx, double output_scale, int64_t output_zero } TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { - m.impl("elu", quantized_elu); - m.impl("celu", quantized_celu); + m.impl(TORCH_SELECTIVE_NAME("quantized::elu"), quantized_elu); + m.impl(TORCH_SELECTIVE_NAME("quantized::celu"), quantized_celu); } }} // namespace at::native diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp index da494936aad7..cb82d9aee469 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp @@ -451,14 +451,12 @@ class QEmbedding final { TORCH_LIBRARY_IMPL(quantized, CPU, m) { // Function that works on TorchBind packed weights. - m.impl("embedding_bag_byte", TORCH_FN(QEmbeddingBag<8>::run)); - m.impl("embedding_byte", TORCH_FN(QEmbedding<8>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::embedding_bag_byte"), TORCH_FN(QEmbeddingBag<8>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::embedding_byte"), TORCH_FN(QEmbedding<8>::run)); // Functions that work on at::Tensor packed weight. 
- m.impl( - "embedding_bag_byte_rowwise_offsets", embedding_bag_byte_rowwise_offsets); - m.impl( - "embedding_bag_4bit_rowwise_offsets", embedding_bag_4bit_rowwise_offsets); + m.impl(TORCH_SELECTIVE_NAME("quantized::embedding_bag_byte_rowwise_offsets"), embedding_bag_byte_rowwise_offsets); + m.impl(TORCH_SELECTIVE_NAME("quantized::embedding_bag_4bit_rowwise_offsets"), embedding_bag_4bit_rowwise_offsets); } } // namespace } // namespace native diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp index dc1f26345e62..e94f0be0d802 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp @@ -104,8 +104,6 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) { embedding_rows, embedding_cols + 8}; // extra 8 bytes to store FP scale and zero_point per row. - size_t output_columns = output_shape[1]; - constexpr float kEpsilon = 1e-8f; // Allocate output packed weights auto output = at::empty( @@ -114,6 +112,12 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) { weight_contig.suggest_memory_format()); auto* output_data = output.data_ptr(); +#ifdef USE_FBGEMM + fbgemm::FloatToFused8BitRowwiseQuantizedSBFloat( + weight_data, embedding_rows, embedding_cols, output_data); +#else + size_t output_columns = output_shape[1]; + constexpr float kEpsilon = 1e-8f; for (std::size_t row = 0; row < embedding_rows; ++row) { const float* input_row = weight_data + row * embedding_cols; std::uint8_t* output_row = output_data + row * output_columns; @@ -134,10 +138,15 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) { lrintf((input_row[col] - minimum_element) * inverse_scale); } // embedding_cols } // embedding_rows +#endif // USE_FBGEMM + return output; } -Tensor _qembeddingbag_nbit_prepack_helper(const Tensor& weight, int BIT_RATE) { +Tensor _qembeddingbag_nbit_prepack_helper( + const Tensor& weight, + int bit_width, + bool optimized_qparams) { int64_t embedding_rows = weight.size(0); int64_t embedding_cols = weight.size(1); @@ -145,16 +154,16 @@ Tensor _qembeddingbag_nbit_prepack_helper(const Tensor& weight, int BIT_RATE) { const auto weight_data = weight.data_ptr(); TORCH_CHECK( - BIT_RATE == 4 || BIT_RATE == 2, - "BIT_RATE must be either 2 or 4 to use 'qembeddingbag_nbit_prepack'." - "For 8bit, consider using 'embedding_bag_byte_prepack'."); + bit_width == 4 || bit_width == 2, + "bit_width must be either 2 or 4 to use 'qembeddingbag_nbit_prepack'." + "For 8bit, consider using 'embedding_bag_byte_prepack'."); - int NUM_ELEM_PER_BYTE = 8 / BIT_RATE; + int NUM_ELEM_PER_BYTE = 8 / bit_width; TORCH_CHECK( weight_contig.size(weight.dim() - 1) % NUM_ELEM_PER_BYTE == 0, - "qembeddingbag_" + c10::to_string(BIT_RATE) + - "bit_prepack only works for the number of columns a multiple of " - + c10::to_string(NUM_ELEM_PER_BYTE)); + "qembeddingbag_" + c10::to_string(bit_width) + + "bit_prepack only works for the number of columns a multiple of " + + c10::to_string(NUM_ELEM_PER_BYTE)); // The "fused" representation stores the scale and bias with the // row-wise quantized data in one tensor. 
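To make the fused layout mentioned in the comment above concrete: each prepacked row holds the packed sub-byte values followed by a 2-byte fp16 scale and a 2-byte fp16 zero point. A small sketch of the per-row size arithmetic, assuming that layout (the helper name and example sizes are illustrative):

#include <cstdint>
#include <cstdio>

// Width in bytes of one prepacked row: ceil(cols / elems-per-byte) packed
// values, then an fp16 scale and an fp16 zero_point (2 bytes each).
int64_t fused_row_bytes(int64_t embedding_cols, int bit_width) {
  const int64_t num_elem_per_byte = 8 / bit_width;
  const int64_t packed = (embedding_cols + num_elem_per_byte - 1) / num_elem_per_byte;
  return packed + 2 * static_cast<int64_t>(sizeof(uint16_t));  // fp16 is 2 bytes
}

int main() {
  // 128 columns at 4 bits: 64 packed bytes + 4 bytes of qparams = 68.
  std::printf("%lld\n", (long long)fused_row_bytes(128, 4));
  // 128 columns at 2 bits: 32 packed bytes + 4 bytes of qparams = 36.
  std::printf("%lld\n", (long long)fused_row_bytes(128, 2));
}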
@@ -172,55 +181,75 @@ Tensor _qembeddingbag_nbit_prepack_helper(const Tensor& weight, int BIT_RATE) { weight_contig.options().dtype(at::kByte), weight_contig.suggest_memory_format()); auto* output_data = output.data_ptr(); - const auto output_columns = output.size(output.dim() - 1); - - for (int row = 0; row < embedding_rows; ++row) { - const float* input_row = weight_data + row * embedding_cols; - std::uint8_t* output_row = output_data + row * output_columns; - float Xmin = *std::min_element(input_row, input_row + embedding_cols); - float Xmax = *std::max_element(input_row, input_row + embedding_cols); - - Xmin = static_cast(Xmin); - const float range = Xmax - Xmin; - - // Set scale to 1.0f for the corner case of Xmax == Xmin . - // Any non-zero scale would work because during quantization - // (X - Xmin) / scale will be 0 for all X unless scale is 0. - at::Half scale = range == 0 ? 1.0f : range / ((1 << BIT_RATE) - 1); - float inverse_scale = scale == 0 ? 1.0f : 1.0f / scale; - if (scale == 0 || std::isinf(inverse_scale)) { - // Corner case handling when Xmax == Xmin - // Any scale would work because X - Xmin will be 0 for all X - scale = 1.0f; - inverse_scale = 1.0f; - } - - // Update the scale and zero_point of each row. - at::Half* output_row_scale_zp = reinterpret_cast( - output_row + - (embedding_cols + NUM_ELEM_PER_BYTE - 1) / NUM_ELEM_PER_BYTE); - - output_row_scale_zp[0] = scale; - output_row_scale_zp[1] = Xmin; - - // Pack the weight values. - for (int col = 0; col < embedding_cols; ++col) { - float X = input_row[col]; - std::uint8_t quantized = std::max( - 0, - std::min( - lrintf((X - Xmin) * inverse_scale), (1 << BIT_RATE) - 1)); - // We pack 2 4-bit values in a byte. Index 0 is packed in the lower 4-bits - // and index 1 is packed in the upper 4-bits. - if (col % NUM_ELEM_PER_BYTE == 0) { - output_row[col / NUM_ELEM_PER_BYTE] = quantized; +#ifdef USE_FBGEMM + if (!optimized_qparams) { + fbgemm::FloatToFusedNBitRowwiseQuantizedSBHalf( + bit_width, weight_data, embedding_rows, embedding_cols, output_data); + } else { +#endif // USE_FBGEMM + const auto output_columns = output.size(output.dim() - 1); + + for (int row = 0; row < embedding_rows; ++row) { + const float* input_row = weight_data + row * embedding_cols; + std::uint8_t* output_row = output_data + row * output_columns; + + float Xmin, Xmax; + if (optimized_qparams) { + at::Tensor xmax_tensor, xmin_tensor; + std::tie(xmax_tensor, xmin_tensor) = at::choose_qparams_optimized( + weight_contig[row], embedding_cols, 200, 0.16, bit_width); + TORCH_CHECK( + xmax_tensor.numel() == 1 && xmin_tensor.numel() == 1, + "Expected choose_qparams_optimized to return min/max tensors of size 1"); + Xmax = xmax_tensor.item(); + Xmin = xmin_tensor.item(); } else { - output_row[col / NUM_ELEM_PER_BYTE] |= - (quantized << ((col % NUM_ELEM_PER_BYTE) * BIT_RATE)); + Xmin = *std::min_element(input_row, input_row + embedding_cols); + Xmax = *std::max_element(input_row, input_row + embedding_cols); } - } // embedding_cols - } // embedding_rows + Xmin = static_cast(Xmin); + float range = Xmax - Xmin; + // Set scale to 1.0f for the corner case of Xmax == Xmin . + // Any non-zero scale would work because during quantization + // (X - Xmin) / scale will be 0 for all X unless scale is 0. + at::Half scale = range == 0 ? 1.0f : range / ((1 << bit_width) - 1); + float inverse_scale = scale == 0 ? 
1.0f : 1.0f / scale; + if (scale == 0 || std::isinf(inverse_scale)) { + // Corner case handling when Xmax == Xmin + // Any scale would work because X - Xmin will be 0 for all X + scale = 1.0f; + inverse_scale = 1.0f; + } + // Update the scale and zero_point of each row. + at::Half* output_row_scale_zp = reinterpret_cast( + output_row + + (embedding_cols + NUM_ELEM_PER_BYTE - 1) / NUM_ELEM_PER_BYTE); + + output_row_scale_zp[0] = scale; + output_row_scale_zp[1] = Xmin; + + // Pack the weight values. + for (int col = 0; col < embedding_cols; ++col) { + float X = input_row[col]; + std::uint8_t quantized = std::max( + 0, + std::min( + lrintf((X - Xmin) * inverse_scale), (1 << bit_width) - 1)); + // We pack 2 4-bit values in a byte. Index 0 is packed in the lower + // 4-bits and index 1 is packed in the upper 4-bits. + if (col % NUM_ELEM_PER_BYTE == 0) { + output_row[col / NUM_ELEM_PER_BYTE] = quantized; + } else { + output_row[col / NUM_ELEM_PER_BYTE] |= + (quantized << ((col % NUM_ELEM_PER_BYTE) * bit_width)); + } + } // embedding_cols + } // embedding_rows +#ifdef USE_FBGEMM + } +#endif // USE_FBGEMM + return output; } @@ -231,8 +260,11 @@ Tensor _qembeddingbag_nbit_prepack_helper(const Tensor& weight, int BIT_RATE) { // To later de-quantize values, the scale (range / 15) and zero_point // are stored alongside the data. More precisely, each row first has quantized // values, and then 2-byte fp16 scale and 2-byte zero_offset. -Tensor qembeddingbag_4bit_prepack(const Tensor& weight) { - return _qembeddingbag_nbit_prepack_helper(weight, 4 /*BIT_RATE*/); +Tensor qembeddingbag_4bit_prepack( + const Tensor& weight, + bool optimized_qparams) { + return _qembeddingbag_nbit_prepack_helper( + weight, 4 /*bit_width*/, optimized_qparams); } // Applies 2-bit row-wise quantization by determining the range @@ -243,8 +275,11 @@ Tensor qembeddingbag_4bit_prepack(const Tensor& weight) { // are stored alongside the data. More precisely, each row first has quantized // values, and then 2-byte fp16 scale and 2-byte zero_offset. // TODO() - Add 2Bit Embedding Lookup operator. 
-Tensor qembeddingbag_2bit_prepack(const Tensor& weight) { - return _qembeddingbag_nbit_prepack_helper(weight, 2 /*BIT_RATE*/); +Tensor qembeddingbag_2bit_prepack( + const Tensor& weight, + bool optimized_qparams) { + return _qembeddingbag_nbit_prepack_helper( + weight, 2 /*bit_width*/, optimized_qparams); } class QEmbeddingPackWeights final { @@ -255,13 +290,13 @@ class QEmbeddingPackWeights final { }; TORCH_LIBRARY_IMPL(quantized, CPU, m) { - m.impl("embedding_bag_byte_prepack", qembeddingbag_byte_prepack); - m.impl("embedding_bag_4bit_prepack", qembeddingbag_4bit_prepack); - m.impl("embedding_bag_2bit_prepack", qembeddingbag_2bit_prepack); + m.impl(TORCH_SELECTIVE_NAME("quantized::embedding_bag_byte_prepack"), TORCH_FN(qembeddingbag_byte_prepack)); + m.impl(TORCH_SELECTIVE_NAME("quantized::embedding_bag_4bit_prepack"), TORCH_FN(qembeddingbag_4bit_prepack)); + m.impl(TORCH_SELECTIVE_NAME("quantized::embedding_bag_2bit_prepack"), TORCH_FN(qembeddingbag_2bit_prepack)); } TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { - m.impl("embedding_bag_prepack", TORCH_FN(QEmbeddingPackWeights::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::embedding_bag_prepack"), TORCH_FN(QEmbeddingPackWeights::run)); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp index 72d42c61d0e5..ca3d9dc71c7e 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp @@ -73,6 +73,10 @@ Tensor qembeddingbag_byte_unpack(const Tensor& packed_weight) { packed_weight.suggest_memory_format()); float* output_data = output.data_ptr(); +#ifdef USE_FBGEMM + fbgemm::Fused8BitRowwiseQuantizedSBFloatToFloat( + input, input_rows, input_columns, output_data); +#else for (std::size_t row = 0; row < input_rows; ++row) { const std::uint8_t* input_row = input + row * input_columns; const float* input_row_scale_zp = @@ -84,14 +88,17 @@ Tensor qembeddingbag_byte_unpack(const Tensor& packed_weight) { input_row[col] * input_row_scale_zp[0] + input_row_scale_zp[1]; } // output_columns } // input_rows +#endif // USE_FBGEMM return output; } -Tensor _qembeddingbag_nbit_unpack_helper(const Tensor& packed_weight, int BIT_RATE) { +Tensor _qembeddingbag_nbit_unpack_helper( + const Tensor& packed_weight, + int BIT_RATE) { const auto input_rows = packed_weight.size(0); const auto input_columns = packed_weight.size(1); const auto* input_data = packed_weight.data_ptr(); - int NUM_ELEM_PER_BYTE = 8/BIT_RATE; + int NUM_ELEM_PER_BYTE = 8 / BIT_RATE; // The last 4 bytes per row are two fp16 scale and zero_point. // The rest of input_columns is the number of values in the original row. 
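Going the other way, the unpack helper below recovers the original column count from the fused row width described in its comment, and note that in this format dequantization is additive (scale * q + zero_point, where zero_point is the row minimum), unlike the (q - zero_point) * scale form used by the affine kernels earlier in the patch. A short sketch of the column arithmetic, valid when the column count is a multiple of the elements-per-byte value, which the prepack path enforces:

#include <cstdint>
#include <cstdio>

// Original (unpacked) column count of a fused n-bit row that is `input_columns`
// bytes wide: drop the trailing fp16 scale and fp16 zero_point, then expand the
// packed bytes back into individual values.
int64_t unpacked_cols(int64_t input_columns, int bit_width) {
  const int64_t num_elem_per_byte = 8 / bit_width;
  return (input_columns - 2 * static_cast<int64_t>(sizeof(uint16_t))) * num_elem_per_byte;
}

int main() {
  std::printf("%lld\n", (long long)unpacked_cols(68, 4));  // 128, inverting the 4-bit example above
  std::printf("%lld\n", (long long)unpacked_cols(36, 2));  // 128
}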
@@ -105,6 +112,10 @@ Tensor _qembeddingbag_nbit_unpack_helper(const Tensor& packed_weight, int BIT_RA packed_weight.options().dtype(kFloat), packed_weight.suggest_memory_format()); float* output_data = output.data_ptr(); +#ifdef USE_FBGEMM + fbgemm::FusedNBitRowwiseQuantizedSBHalfToFloat( + BIT_RATE, input_data, input_rows, input_columns, output_data); +#else auto output_columns = output_dimensions[1]; for (size_t row = 0; row < input_rows; ++row) { float* output_row = output_data + row * output_columns; @@ -122,6 +133,8 @@ Tensor _qembeddingbag_nbit_unpack_helper(const Tensor& packed_weight, int BIT_RA output_row[col] = scale * quantized + zero_point; } // output_columns } // input_rows +#endif // USE_FBGEMM + return output; } @@ -158,15 +171,15 @@ class QEmbeddingUnpackWeights final { }; TORCH_LIBRARY_IMPL(quantized, CPU, m) { - m.impl("embedding_bag_byte_unpack", qembeddingbag_byte_unpack); - m.impl("embedding_bag_4bit_unpack", qembeddingbag_4bit_unpack); - m.impl("embedding_bag_2bit_unpack", qembeddingbag_2bit_unpack); + m.impl(TORCH_SELECTIVE_NAME("quantized::embedding_bag_byte_unpack"), qembeddingbag_byte_unpack); + m.impl(TORCH_SELECTIVE_NAME("quantized::embedding_bag_4bit_unpack"), qembeddingbag_4bit_unpack); + m.impl(TORCH_SELECTIVE_NAME("quantized::embedding_bag_2bit_unpack"), qembeddingbag_2bit_unpack); } TORCH_LIBRARY_IMPL(quantized, CatchAll, m) { // Unpack the packed embedding_bag weights using TorchBind custom class. // TODO extend to support 4-bit qtensor. - m.impl("embedding_bag_unpack", TORCH_FN(QEmbeddingUnpackWeights::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::embedding_bag_unpack"), TORCH_FN(QEmbeddingUnpackWeights::run)); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qhardswish.cpp b/aten/src/ATen/native/quantized/cpu/qhardswish.cpp index f0dbd644b2be..064b88a8c91f 100644 --- a/aten/src/ATen/native/quantized/cpu/qhardswish.cpp +++ b/aten/src/ATen/native/quantized/cpu/qhardswish.cpp @@ -85,7 +85,7 @@ Tensor quantized_hardswish(const Tensor& qx, double output_scale, int64_t output } TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { - m.impl("hardswish", TORCH_FN(quantized_hardswish)); + m.impl(TORCH_SELECTIVE_NAME("quantized::hardswish"), TORCH_FN(quantized_hardswish)); } }} // namespace at::native diff --git a/aten/src/ATen/native/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/quantized/cpu/qlinear.cpp index fdc6d1dd4d8b..a7b4f4b74357 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp @@ -397,12 +397,12 @@ class QLinearInt8 final { }; TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { - m.impl("linear", TORCH_FN(QLinearInt8::run)); - m.impl("linear_relu", TORCH_FN(QLinearInt8::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear"), TORCH_FN(QLinearInt8::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_relu"), TORCH_FN(QLinearInt8::run)); } TORCH_LIBRARY_IMPL(_quantized, QuantizedCPU, m) { - m.impl("linear", TORCH_FN(QLinearInt8::run)); + m.impl(TORCH_SELECTIVE_NAME("_quantized::linear"), TORCH_FN(QLinearInt8::run)); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp index 2accf060deab..af2d7749ee50 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp @@ -455,13 +455,13 @@ class QLinearDynamicFp16 final { }; TORCH_LIBRARY_IMPL(quantized, CPU, m) { - m.impl("linear_dynamic", TORCH_FN(QLinearDynamicInt8::run)); - 
m.impl("linear_relu_dynamic", TORCH_FN(QLinearDynamicInt8::run)); - m.impl("linear_dynamic_fp16", TORCH_FN(QLinearDynamicFp16::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_dynamic"), TORCH_FN(QLinearDynamicInt8::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_relu_dynamic"), TORCH_FN(QLinearDynamicInt8::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_dynamic_fp16"), TORCH_FN(QLinearDynamicFp16::run)); } TORCH_LIBRARY_IMPL(_quantized, CPU, m) { - m.impl("linear_dynamic", TORCH_FN(QLinearDynamicInt8::run)); + m.impl(TORCH_SELECTIVE_NAME("_quantized::linear_dynamic"), TORCH_FN(QLinearDynamicInt8::run)); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp index ee4b6ee2aaf6..23912f87d123 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp @@ -318,22 +318,22 @@ class QLinearPackWeightFp16Legacy final { }; TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { - m.impl("linear_prepack", TORCH_FN(QLinearPackWeightInt8::run)); - m.impl("linear_prepack_legacy", TORCH_FN(QLinearPackWeightInt8Legacy::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_prepack"), TORCH_FN(QLinearPackWeightInt8::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_prepack_legacy"), TORCH_FN(QLinearPackWeightInt8Legacy::run)); } TORCH_LIBRARY_IMPL(quantized, CPU, m) { - m.impl("linear_prepack_fp16", TORCH_FN(QLinearPackWeightFp16::run)); - m.impl("linear_prepack_fp16_legacy", TORCH_FN(QLinearPackWeightFp16Legacy::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_prepack_fp16"), TORCH_FN(QLinearPackWeightFp16::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_prepack_fp16_legacy"), TORCH_FN(QLinearPackWeightFp16Legacy::run)); } TORCH_LIBRARY_IMPL(_quantized, QuantizedCPU, m) { - m.impl("linear_prepack", TORCH_FN(QLinearPackWeightInt8::run)); + m.impl(TORCH_SELECTIVE_NAME("_quantized::linear_prepack"), TORCH_FN(QLinearPackWeightInt8::run)); } TORCH_LIBRARY_IMPL(_quantized, CPU, m) { - m.impl("linear_prepack_fp16", TORCH_FN(QLinearPackWeightFp16::run)); - m.impl("linear_prepack_fp16_legacy", TORCH_FN(QLinearPackWeightFp16Legacy::run)); + m.impl(TORCH_SELECTIVE_NAME("_quantized::linear_prepack_fp16"), TORCH_FN(QLinearPackWeightFp16::run)); + m.impl(TORCH_SELECTIVE_NAME("_quantized::linear_prepack_fp16_legacy"), TORCH_FN(QLinearPackWeightFp16Legacy::run)); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_unpack.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_unpack.cpp index 1bc8711a22f4..ecbae04dd957 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear_unpack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear_unpack.cpp @@ -137,13 +137,13 @@ class QLinearUnpackWeightFp16Legacy final { }; TORCH_LIBRARY_IMPL(quantized, CPU, m) { - m.impl("linear_unpack.legacy", TORCH_FN(QLinearUnpackWeightInt8Legacy::run)); - m.impl("linear_unpack_fp16.legacy", TORCH_FN(QLinearUnpackWeightFp16Legacy::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_unpack.legacy"), TORCH_FN(QLinearUnpackWeightInt8Legacy::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_unpack_fp16.legacy"), TORCH_FN(QLinearUnpackWeightFp16Legacy::run)); } TORCH_LIBRARY_IMPL(quantized, CatchAll, m) { - m.impl("linear_unpack", TORCH_FN(QLinearUnpackWeightInt8::run)); - m.impl("linear_unpack_fp16", TORCH_FN(QLinearUnpackWeightFp16::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_unpack"), 
TORCH_FN(QLinearUnpackWeightInt8::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_unpack_fp16"), TORCH_FN(QLinearUnpackWeightFp16::run)); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qmul.cpp b/aten/src/ATen/native/quantized/cpu/qmul.cpp index 13aa8acc669a..deeae36dc502 100644 --- a/aten/src/ATen/native/quantized/cpu/qmul.cpp +++ b/aten/src/ATen/native/quantized/cpu/qmul.cpp @@ -173,26 +173,26 @@ class QMulScalarTensorOut final { }; TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { - m.impl("mul", TORCH_FN(QMul::run)); - m.impl("mul.out", TORCH_FN(QMulOut::run)); - m.impl("mul.Scalar", TORCH_FN(QMulScalar::run)); - m.impl("mul.Scalar_out", TORCH_FN(QMulScalarOut::run)); - m.impl("mul_relu", TORCH_FN(QMul::run)); - m.impl("mul_relu.out", TORCH_FN(QMulOut::run)); - m.impl("mul_relu.Scalar", TORCH_FN(QMulScalar::run)); - m.impl("mul_relu.Scalar_out", TORCH_FN(QMulScalarOut::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul"), TORCH_FN(QMul::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul.out"), TORCH_FN(QMulOut::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul.Scalar"), TORCH_FN(QMulScalar::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul.Scalar_out"), TORCH_FN(QMulScalarOut::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_relu"), TORCH_FN(QMul::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_relu.out"), TORCH_FN(QMulOut::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_relu.Scalar"), TORCH_FN(QMulScalar::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_relu.Scalar_out"), TORCH_FN(QMulScalarOut::run)); // deprecated functions, kept for backward compatibility - m.impl("mul_out", TORCH_FN(QMulOut::run)); - m.impl("mul_relu_out", TORCH_FN(QMulOut::run)); - m.impl("mul_scalar", TORCH_FN(QMulScalar::run)); - m.impl("mul_scalar_relu", TORCH_FN(QMulScalar::run)); - m.impl("mul_scalar_out", TORCH_FN(QMulScalarOut::run)); - m.impl("mul_scalar_relu_out", TORCH_FN(QMulScalarOut::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_out"), TORCH_FN(QMulOut::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_relu_out"), TORCH_FN(QMulOut::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_scalar"), TORCH_FN(QMulScalar::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_scalar_relu"), TORCH_FN(QMulScalar::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_scalar_out"), TORCH_FN(QMulScalarOut::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_scalar_relu_out"), TORCH_FN(QMulScalarOut::run)); // TODO: remove after broadcasting is supported - m.impl("mul_scalar.Tensor", TORCH_FN(QMulScalarTensor::run)); - m.impl("mul_scalar_relu.Tensor", TORCH_FN(QMulScalarTensor::run)); - m.impl("mul_scalar_out.Tensor", TORCH_FN(QMulScalarTensorOut::run)); - m.impl("mul_scalar_relu_out.Tensor", TORCH_FN(QMulScalarTensorOut::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_scalar.Tensor"), TORCH_FN(QMulScalarTensor::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_scalar_relu.Tensor"), TORCH_FN(QMulScalarTensor::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_scalar_out.Tensor"), TORCH_FN(QMulScalarTensorOut::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_scalar_relu_out.Tensor"), TORCH_FN(QMulScalarTensorOut::run)); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qnormalization.cpp b/aten/src/ATen/native/quantized/cpu/qnormalization.cpp index f5bef2b93a0a..6ed193cd82c9 100644 --- a/aten/src/ATen/native/quantized/cpu/qnormalization.cpp +++ b/aten/src/ATen/native/quantized/cpu/qnormalization.cpp @@ -120,7 +120,7 @@ Tensor 
quantized_instance_norm_impl( TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { // TODO: this is kind of... blegh - m.impl("layer_norm", []( + m.impl(TORCH_SELECTIVE_NAME("quantized::layer_norm"), []( Tensor input, std::vector normalized_shape, // because IntArrayRef doesn't work c10::optional weight, @@ -134,7 +134,7 @@ TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { bias.has_value() ? *bias : Tensor(), eps, output_scale, output_zero_point); }); - m.impl("group_norm", []( + m.impl(TORCH_SELECTIVE_NAME("quantized::group_norm"), []( Tensor qx, int64_t num_groups, c10::optional weight, @@ -148,7 +148,7 @@ TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { bias.has_value() ? *bias : Tensor(), eps, output_scale, output_zero_point); }); - m.impl("instance_norm", []( + m.impl(TORCH_SELECTIVE_NAME("quantized::instance_norm"), []( Tensor qx, c10::optional weight, c10::optional bias, diff --git a/aten/src/ATen/native/quantized/cpu/qpool.cpp b/aten/src/ATen/native/quantized/cpu/qpool.cpp index f986ab4934b9..7fa56619609b 100644 --- a/aten/src/ATen/native/quantized/cpu/qpool.cpp +++ b/aten/src/ATen/native/quantized/cpu/qpool.cpp @@ -134,7 +134,12 @@ Tensor q_maxpool_2d( int64_t oC = iC; int64_t oH = pooling_output_shape(iH, kH, pH, sH, dH, ceil_mode); int64_t oW = pooling_output_shape(iW, kW, pW, sW, dW, ceil_mode); - TORCH_CHECK(oH > 0 && oW > 0, "the resulting Tensor is too small."); + TORCH_CHECK(oH > 0 && oW > 0, + "Given input size: (", + iC, "x", iH, "x", iW, + "). Calculated output size: (", + oC, "x", oH, "x", oW, + "). Output size is too small."); std::vector oSizes; if (ndim == 3) { @@ -232,7 +237,7 @@ void check_maxpool2d_params( } #ifdef USE_PYTORCH_QNNPACK - static Tensor qnnpack_maxpool( + static Tensor qnnpack_maxpool2d( Tensor input, IntArrayRef kernel_size, IntArrayRef stride, @@ -243,23 +248,23 @@ void check_maxpool2d_params( TORCH_CHECK( input.ndimension() == 4, - "qnnpack_maxpool(): Expected input to be 4-dimensional: got ", + "qnnpack_maxpool2d(): Expected input to be 4-dimensional: got ", input.ndimension()); TORCH_CHECK( kernel_size.size() == 2, - "qnnpack_maxpool(): Expected kernel_size to be 2-dimensional: got ", + "qnnpack_maxpool2d(): Expected kernel_size to be 2-dimensional: got ", kernel_size.size()); TORCH_CHECK( stride.size() == 2, - "qnnpack_maxpool(): Expected stride to be 2-dimensional: got ", + "qnnpack_maxpool2d(): Expected stride to be 2-dimensional: got ", stride.size()); TORCH_CHECK( dilation.size() == 2, - "qnnpack_maxpool(): Expected dilation to be 2-dimensional: got ", + "qnnpack_maxpool2d(): Expected dilation to be 2-dimensional: got ", dilation.size()); TORCH_CHECK( padding.size() == 2, - "qnnpack_maxpool(): Expected padding to be 2-dimensional: got ", + "qnnpack_maxpool2d(): Expected padding to be 2-dimensional: got ", padding.size()); int64_t batch_size = input.size(0); @@ -284,10 +289,10 @@ void check_maxpool2d_params( TORCH_CHECK( kH > 0 && kW > 0, - "qnnpack_maxpool(): kernel_size should be greater than zero."); + "qnnpack_maxpool2d(): kernel_size should be greater than zero."); TORCH_CHECK( strideH > 0 && strideW > 0, - "qnnpack_maxpool(): strides should be greater than zero."); + "qnnpack_maxpool2d(): strides should be greater than zero."); const pytorch_qnnp_status createStatus = pytorch_qnnp_create_max_pooling2d_nhwc_u8( @@ -318,7 +323,7 @@ void check_maxpool2d_params( TORCH_CHECK( outH > 0 && outW > 0, - "qnnpack_maxpool(): the resulting output Tensor size should be >= 0"); + "qnnpack_maxpool2d(): the resulting output Tensor size should be >= 0"); 
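The improved TORCH_CHECK message above reports the computed output size. For reference, the floor-mode output length follows the standard pooling formula, which pooling_output_shape is assumed to implement here (ceil_mode additionally rounds the division up). A minimal sketch:

#include <cstdint>
#include <cstdio>

// Floor-mode pooling output length for one spatial dimension.
int64_t pool_out_size(int64_t input, int64_t kernel, int64_t pad,
                      int64_t stride, int64_t dilation) {
  return (input + 2 * pad - dilation * (kernel - 1) - 1) / stride + 1;
}

int main() {
  // A 2-wide window over a 1-wide input yields a non-positive size, which is
  // exactly the case the new error message reports.
  std::printf("%lld\n", (long long)pool_out_size(1, 2, 0, 1, 1));  // 0
  std::printf("%lld\n", (long long)pool_out_size(7, 3, 1, 2, 1));  // 4
}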
std::unique_ptr qnnpack_uniq_ptr(qnnpack_operator); @@ -375,7 +380,7 @@ Tensor quantized_max_pool2d( } #ifdef USE_PYTORCH_QNNPACK if (at::globalContext().qEngine() == at::QEngine::QNNPACK && qx.scalar_type() == kQUInt8 && !ceil_mode) { - return qnnpack_maxpool(qx, kernel_size, stride, padding, dilation, ceil_mode); + return qnnpack_maxpool2d(qx, kernel_size, stride, padding, dilation, ceil_mode); } #endif Tensor qy; @@ -395,9 +400,37 @@ Tensor quantized_max_pool2d( return qy; } +// Quantized max_pool1d is a special case of the max_pool2d, with one of the +// dimensions and kernels removed. +Tensor quantized_max_pool1d( + const Tensor& qx, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode) { + // (C, L) -> (C, 1, L) => kSqueezeDim = 1 + // (N, C, L) -> (N, C, 1, L) => kSqueezeDim = 2 + const int32_t kSqueezeDim = qx.dim() - 1; + const auto qx_unsqueeze = qx.unsqueeze(kSqueezeDim); + if (stride.empty()) { + stride = kernel_size; + } + auto qy = at::quantized_max_pool2d( + qx.unsqueeze(kSqueezeDim), + {1, kernel_size[0]}, + {1, stride[0]}, + {0, padding[0]}, + {1, dilation[0]}, + ceil_mode); + qy = qy.squeeze(kSqueezeDim); + return qy; +} + // Keep the registry in the anonymous namespace. namespace { -class QMaxPool2D_arr_args final { +template +class QMaxPool_arr_args final { public: static Tensor run( Tensor qx, @@ -406,17 +439,20 @@ class QMaxPool2D_arr_args final { std::vector padding, std::vector dilation, bool ceil_mode) { - #ifdef USE_PYTORCH_QNNPACK - if (at::globalContext().qEngine() == at::QEngine::QNNPACK && qx.scalar_type() == kQUInt8 && !ceil_mode) { - return qnnpack_maxpool(qx, kernel_size, stride, padding, dilation, ceil_mode); + if (kSpatialDim == 1) { + return at::quantized_max_pool1d(qx, kernel_size, stride, padding, + dilation, ceil_mode); + } else if (kSpatialDim == 2) { + return at::quantized_max_pool2d(qx, kernel_size, stride, padding, + dilation, ceil_mode); } - #endif - return at::max_pool2d(qx, kernel_size, stride, padding, dilation, ceil_mode); + TORCH_CHECK(false, "MaxPool", kSpatialDim, "D is not supported."); } }; TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { - m.impl("max_pool2d", TORCH_FN(QMaxPool2D_arr_args::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::max_pool1d"), TORCH_FN(QMaxPool_arr_args<1>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::max_pool2d"), TORCH_FN(QMaxPool_arr_args<2>::run)); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qreduction.cpp b/aten/src/ATen/native/quantized/cpu/qreduction.cpp index 739638b7a67e..74b266114230 100644 --- a/aten/src/ATen/native/quantized/cpu/qreduction.cpp +++ b/aten/src/ATen/native/quantized/cpu/qreduction.cpp @@ -83,7 +83,14 @@ Tensor& mean_out_quantized_cpu( c10::optional opt_dtype) { #ifdef USE_PYTORCH_QNNPACK if (at::globalContext().qEngine() == at::QEngine::QNNPACK && - self.scalar_type() == kQUInt8) { + self.scalar_type() == kQUInt8 && + // QNNPACK currently is only supported for NCHW + dim=(2, 3) + // Remove these checks after generic version is implemented. 
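The quantized_max_pool1d added above reuses the 2d kernel by inserting a dummy spatial dimension. The sketch below shows the same bookkeeping with ordinary float tensors, purely to illustrate the unsqueeze/squeeze and parameter padding; it is not part of the patch:

#include <ATen/ATen.h>
#include <cassert>

int main() {
  at::Tensor x = at::randn({2, 3, 16});     // (N, C, L)
  const int64_t kSqueezeDim = x.dim() - 1;  // 2 for (N, C, L); 1 for a (C, L) input
  at::Tensor y = at::max_pool2d(
                     x.unsqueeze(kSqueezeDim),  // (N, C, 1, L)
                     /*kernel_size=*/{1, 2},
                     /*stride=*/{1, 2},
                     /*padding=*/{0, 0},
                     /*dilation=*/{1, 1},
                     /*ceil_mode=*/false)
                     .squeeze(kSqueezeDim);     // back to (N, C, L_out)
  assert(y.dim() == 3 && y.size(2) == 8);
  // The dedicated 1d op produces the same result.
  assert(y.equal(at::max_pool1d(x, {2}, {2}, {0}, {1}, false)));
}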
+ self.ndimension() == 4 && + dim.size() == 2 && + dim[0] == 2 && + dim[1] == 3 + ){ result = qnnpack_mean(self, dim); return result; } diff --git a/aten/src/ATen/native/quantized/cpu/qrelu.cpp b/aten/src/ATen/native/quantized/cpu/qrelu.cpp index 447e5cb23af5..ca03081a1a25 100644 --- a/aten/src/ATen/native/quantized/cpu/qrelu.cpp +++ b/aten/src/ATen/native/quantized/cpu/qrelu.cpp @@ -113,7 +113,7 @@ Tensor& leaky_relu_out_quantized_cpu(Tensor& result, const Tensor& self, return result; } -Tensor heaky_relu_quantized_cpu(const Tensor& self, Scalar negval) { +Tensor leaky_relu_quantized_cpu(const Tensor& self, Scalar negval) { const auto qx = self.contiguous(self.suggest_memory_format()); auto qy = at::_empty_affine_quantized(qx.sizes(), at::device(kCPU).dtype(self.scalar_type()), @@ -170,8 +170,27 @@ class QRelu6 final { } }; +class QLeakyRelu final { + public: + static Tensor run(Tensor self, Scalar negative_slope, bool inplace, double output_scale, int64_t output_zero_point) { + // inplace argument is ignored now, TODO:support inplace + if (inplace) { + TORCH_WARN("inplace=True is not supported for quantized::leaky_relu yet"); + } + const auto qx = self.contiguous(self.suggest_memory_format()); + auto qy = at::_empty_affine_quantized(qx.sizes(), + at::device(kCPU).dtype(self.scalar_type()), + output_scale, + output_zero_point, + self.suggest_memory_format()); + qrelu_leaky_stub(self.device().type(), qy, qx, negative_slope); + return qy; + } +}; + TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { - m.impl("relu6", TORCH_FN(QRelu6::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::relu6"), TORCH_FN(QRelu6::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::leaky_relu"), TORCH_FN(QLeakyRelu::run)); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qthreshold.cpp b/aten/src/ATen/native/quantized/cpu/qthreshold.cpp index 281274d27be2..a42da4081c71 100644 --- a/aten/src/ATen/native/quantized/cpu/qthreshold.cpp +++ b/aten/src/ATen/native/quantized/cpu/qthreshold.cpp @@ -35,7 +35,7 @@ Tensor threshold_quantized_cpu( } TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { - m.impl("threshold", TORCH_FN(threshold_quantized_cpu)); + m.impl(TORCH_SELECTIVE_NAME("quantized::threshold"), TORCH_FN(threshold_quantized_cpu)); } } // namespace native diff --git a/aten/src/ATen/native/quantized/library.cpp b/aten/src/ATen/native/quantized/library.cpp index c8e247b42365..dceb06b05d4a 100644 --- a/aten/src/ATen/native/quantized/library.cpp +++ b/aten/src/ATen/native/quantized/library.cpp @@ -20,174 +20,158 @@ TORCH_LIBRARY(quantized, m) { register_conv_params<3>(); register_embedding_params(); - m.def("add(Tensor qa, Tensor qb, float scale, int zero_point) -> Tensor qc"); - m.def("add.out(Tensor qa, Tensor qb, Tensor(a!) out) -> Tensor(a!) out"); - m.def("add.Scalar(Tensor qa, Scalar b) -> Tensor qc"); - m.def("add.Scalar_out(Tensor qa, Scalar b, Tensor(a!) out) -> Tensor(a!) out"); - m.def("add_relu(Tensor qa, Tensor qb, float scale, int zero_point) -> Tensor qc"); - m.def("add_relu.Scalar(Tensor qa, Scalar b) -> Tensor qc"); - m.def("add_relu.out(Tensor qa, Tensor qb, Tensor(a!) out) -> Tensor(a!) out"); - m.def("add_relu.Scalar_out(Tensor qa, Scalar b, Tensor(a!) out) -> Tensor(a!) out"); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add(Tensor qa, Tensor qb, float scale, int zero_point) -> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add.out(Tensor qa, Tensor qb, Tensor(a!) out) -> Tensor(a!) 
out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add.Scalar(Tensor qa, Scalar b) -> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add.Scalar_out(Tensor qa, Scalar b, Tensor(a!) out) -> Tensor(a!) out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_relu(Tensor qa, Tensor qb, float scale, int zero_point) -> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_relu.Scalar(Tensor qa, Scalar b) -> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_relu.out(Tensor qa, Tensor qb, Tensor(a!) out) -> Tensor(a!) out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_relu.Scalar_out(Tensor qa, Scalar b, Tensor(a!) out) -> Tensor(a!) out")); // deprecated functions, kept for backward compatibility - m.def("add_out(Tensor qa, Tensor qb, Tensor(a!) out) -> Tensor(a!) out"); - m.def("add_relu_out(Tensor qa, Tensor qb, Tensor(a!) out) -> Tensor(a!) out"); - m.def("add_scalar(Tensor qa, Scalar b) -> Tensor qc"); - m.def("add_scalar_relu(Tensor qa, Scalar b) -> Tensor qc"); - m.def("add_scalar_out(Tensor qa, Scalar b, Tensor(a!) out) -> Tensor(a!) out"); - m.def("add_scalar_relu_out(Tensor qa, Scalar b, Tensor(a!) out) -> Tensor(a!) out"); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_out(Tensor qa, Tensor qb, Tensor(a!) out) -> Tensor(a!) out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_relu_out(Tensor qa, Tensor qb, Tensor(a!) out) -> Tensor(a!) out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_scalar(Tensor qa, Scalar b) -> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_scalar_relu(Tensor qa, Scalar b) -> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_scalar_out(Tensor qa, Scalar b, Tensor(a!) out) -> Tensor(a!) out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_scalar_relu_out(Tensor qa, Scalar b, Tensor(a!) out) -> Tensor(a!) out")); // TODO: remove after broadcasting is supported - m.def("add_scalar_out.Tensor(Tensor qa, Tensor b, Tensor(a!) out) -> Tensor(a!) out"); - m.def("add_scalar.Tensor(Tensor qa, Tensor b) -> Tensor qc"); - m.def("add_scalar_relu.Tensor(Tensor qa, Tensor b) -> Tensor qc"); - m.def("add_scalar_relu_out.Tensor(Tensor qa, Tensor b, Tensor(a!) out) -> Tensor(a!) out"); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_scalar_out.Tensor(Tensor qa, Tensor b, Tensor(a!) out) -> Tensor(a!) out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_scalar.Tensor(Tensor qa, Tensor b) -> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_scalar_relu.Tensor(Tensor qa, Tensor b) -> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_scalar_relu_out.Tensor(Tensor qa, Tensor b, Tensor(a!) out) -> Tensor(a!) out")); // This is needed for graph mode quantization, when we fuse // dequant - aten::batch_norm - quant into quantized::batch_norm // and dimension is unknown given only the aten op call // quantized::batch_norm supports both 2d and 3d batch norm right now // it should also support 1d batch_norm after quantized::batch_norm1d is // implemented - m.def("batch_norm(Tensor qx, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor"); - m.def("batch_norm_relu(Tensor qx, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor"); - m.def("batch_norm1d(Tensor qx, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor"); - m.def("batch_norm1d_relu(Tensor qx, Tensor? weight, Tensor? 
bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor"); - m.def("batch_norm2d(Tensor qx, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor"); - m.def("batch_norm2d_relu(Tensor qx, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor"); - m.def("batch_norm3d(Tensor qx, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor"); - m.def("batch_norm3d_relu(Tensor qx, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor"); - m.def("clamp(Tensor qx, Scalar? min, Scalar? max) -> Tensor qy"); - m.def("threshold(Tensor qx, Scalar threshold, Scalar value) -> Tensor qy"); - m.def("cat(Tensor[] qx, int dim, float? scale, int? zero_point) -> Tensor"); - m.def("cat_relu(Tensor[] qx, int dim, float? scale, int? zero_point) -> Tensor"); - m.def("cat_out(Tensor[] qx, int dim, Tensor(a!) out) -> Tensor(a!)"); - m.def("cat_relu_out(Tensor[] qx, int dim, Tensor(a!) out) -> Tensor(a!)"); - m.def("conv1d(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"); - m.def("conv1d_relu(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"); - m.def("conv2d.new(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"); - m.def("conv2d_relu.new(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"); - m.def("conv3d.new(Tensor qx, __torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"); - m.def("conv3d_relu.new(Tensor qx, __torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"); - m.def("conv2d(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase weight, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point) -> Tensor"); - m.def("conv2d_relu(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase weight, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point) -> Tensor"); - m.def("conv3d(Tensor qx, __torch__.torch.classes.quantized.Conv3dPackedParamsBase weight, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point) -> Tensor"); - m.def("conv3d_relu(Tensor qx, __torch__.torch.classes.quantized.Conv3dPackedParamsBase weight, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point) -> Tensor"); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::batch_norm(Tensor qx, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::batch_norm_relu(Tensor qx, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::batch_norm1d(Tensor qx, Tensor? weight, Tensor? 
bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::batch_norm1d_relu(Tensor qx, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::batch_norm2d(Tensor qx, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::batch_norm2d_relu(Tensor qx, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::batch_norm3d(Tensor qx, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::batch_norm3d_relu(Tensor qx, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::clamp(Tensor qx, Scalar? min, Scalar? max) -> Tensor qy")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::threshold(Tensor qx, Scalar threshold, Scalar value) -> Tensor qy")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::cat(Tensor[] qx, int dim, float? scale, int? zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::cat_relu(Tensor[] qx, int dim, float? scale, int? zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::cat_out(Tensor[] qx, int dim, Tensor(a!) out) -> Tensor(a!)")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::cat_relu_out(Tensor[] qx, int dim, Tensor(a!) out) -> Tensor(a!)")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv1d(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv1d_relu(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d.new(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d_relu.new(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv3d.new(Tensor qx, __torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv3d_relu.new(Tensor qx, __torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase weight, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d_relu(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase weight, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv3d(Tensor qx, __torch__.torch.classes.quantized.Conv3dPackedParamsBase weight, int[] stride, int[] padding, int[] 
dilation, int groups, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv3d_relu(Tensor qx, __torch__.torch.classes.quantized.Conv3dPackedParamsBase weight, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point) -> Tensor")); // conv_prepack is deprecated, please use conv2d_prepack for 2D conv. - m.def("conv_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase"); - m.def("conv1d_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase"); - m.def("conv2d_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase"); - m.def("conv3d_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv3dPackedParamsBase"); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv1d_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv3d_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv3dPackedParamsBase")); // conv_unpack is deprecated, please use conv2d_unpack for 2D conv. - m.def("conv_unpack(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> (Tensor unpacked_weights, Tensor? B_origin)"); - m.def("conv1d_unpack(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> (Tensor unpacked_weights, Tensor? B_origin)"); - m.def("conv2d_unpack(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> (Tensor unpacked_weights, Tensor? B_origin)"); - m.def("conv3d_unpack(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> (Tensor unpacked_weights, Tensor? 
B_origin)"); - m.def("conv2d_stride(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int[]"); - m.def("conv2d_padding(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int[]"); - m.def("conv2d_dilation(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int[]"); - m.def("conv2d_groups(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int"); - m.def("conv2d_transpose(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int"); - m.def("conv3d_stride(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> int[]"); - m.def("conv3d_padding(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> int[]"); - m.def("conv3d_dilation(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> int[]"); - m.def("conv3d_groups(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> int"); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_unpack(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> (Tensor unpacked_weights, Tensor? B_origin)")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv1d_unpack(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> (Tensor unpacked_weights, Tensor? B_origin)")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d_unpack(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> (Tensor unpacked_weights, Tensor? B_origin)")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv3d_unpack(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> (Tensor unpacked_weights, Tensor? B_origin)")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d_stride(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int[]")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d_padding(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int[]")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d_dilation(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int[]")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d_groups(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d_transpose(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv3d_stride(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> int[]")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv3d_padding(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> int[]")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv3d_dilation(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> int[]")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv3d_groups(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> int")); // conv_tranpsose - m.def("conv_transpose1d(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"); - m.def("conv_transpose2d(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"); - m.def("conv_transpose1d_prepack(Tensor weight, Tensor? 
bias, int[] stride, int[] padding, int[] output_padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase"); - m.def("conv_transpose2d_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] output_padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase"); - m.def("conv_transpose1d_unpack(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> (Tensor unpacked_weights, Tensor? B_origin)"); - m.def("conv_transpose2d_unpack(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> (Tensor unpacked_weights, Tensor? B_origin)"); - m.def("conv_transpose2d_stride(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int[]"); - m.def("conv_transpose2d_padding(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int[]"); - m.def("conv_transpose2d_dilation(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int[]"); - m.def("conv_transpose2d_groups(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int"); - m.def("conv_transpose2d_transpose(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int"); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose1d(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose2d(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose1d_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] output_padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose1d_unpack(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> (Tensor unpacked_weights, Tensor? B_origin)")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose2d_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] output_padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose2d_unpack(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> (Tensor unpacked_weights, Tensor? 
B_origin)")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose2d_stride(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int[]")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose2d_padding(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int[]")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose2d_dilation(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int[]")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose2d_groups(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose2d_transpose(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int")); - m.def("elu(Tensor self, float output_scale, int output_zero_point, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor"); - m.def("embedding_bag_prepack(Tensor weight) -> __torch__.torch.classes.quantized.EmbeddingPackedParamsBase W_prepack"); - m.def("embedding_bag_unpack(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase W_prepack) -> Tensor W_origin"); - m.def("embedding_bag_byte_prepack(Tensor weight) -> Tensor"); - m.def("embedding_bag_byte_unpack(Tensor weight) -> Tensor"); - m.def("embedding_bag_4bit_prepack(Tensor weight) -> Tensor"); - m.def("embedding_bag_4bit_unpack(Tensor weight) -> Tensor"); - m.def("embedding_bag_2bit_prepack(Tensor weight) -> Tensor"); - m.def("embedding_bag_2bit_unpack(Tensor weight) -> Tensor"); - m.def("embedding_bag_byte_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False) -> Tensor"); - m.def("embedding_bag_4bit_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor"); - m.def("embedding_bag_byte(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor"); - m.def("embedding_byte(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase weight, Tensor indices, bool sparse=False) -> Tensor"); - m.def("celu(Tensor self, float output_scale, int output_zero_point, Scalar alpha=1) -> Tensor"); - m.def("hardswish(Tensor input, float output_scale, int output_zero_point) -> Tensor"); - m.def("group_norm(Tensor input, int num_groups, Tensor? weight, Tensor? bias, float eps, float output_scale, int output_zero_point) -> Tensor"); - m.def("hardswish(Tensor input, float output_scale, int output_zero_point) -> Tensor"); - m.def("instance_norm(Tensor input, Tensor? weight, Tensor? bias, float eps, float output_scale, int output_zero_point) -> Tensor"); - m.def("layer_norm(Tensor input, int[] normalized_shape, Tensor? weight, Tensor? 
bias, float eps, float output_scale, int output_zero_point) -> Tensor"); - m.def( - "linear(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, float Y_scale_i, int Y_zero_point_i) -> Tensor Y"); - m.def( - "linear_relu(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, float Y_scale_i, int Y_zero_point_i) -> Tensor Y"); - m.def( - "linear_dynamic(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, bool reduce_range=False) -> Tensor Y"); - m.def( - "linear_relu_dynamic(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, bool reduce_range=False) -> Tensor Y"); - m.def( - "linear_dynamic_fp16(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack) -> Tensor Y"); - m.def( - "linear_prepack(Tensor W, Tensor? B=None) -> __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack"); - m.def( - "linear_prepack_fp16(Tensor W, Tensor? B=None) -> __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack"); - m.def("linear_prepack_legacy(Tensor W, Tensor? B=None) -> Tensor W_prepack"); - m.def( - "linear_prepack_fp16_legacy(Tensor W, Tensor? B=None) -> Tensor W_prepack"); - m.def( - "linear_unpack(__torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack) -> (Tensor W_origin, Tensor? B_origin)"); - m.def( - "linear_unpack_fp16(__torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack) -> (Tensor W_origin, Tensor? B_origin)"); - m.def( - "linear_unpack.legacy(Tensor W_prepack) -> (Tensor W_origin, Tensor? B_origin)"); - m.def( - "linear_unpack_fp16.legacy(Tensor W_prepack) -> (Tensor W_origin, Tensor? B_origin)"); - m.def("mul(Tensor qa, Tensor qb, float scale, int zero_point)-> Tensor qc"); - m.def("mul.out(Tensor qa, Tensor qb, Tensor(a!) out)-> Tensor(a!) out"); - m.def("mul.Scalar(Tensor qa, Scalar b)-> Tensor qc"); - m.def("mul.Scalar_out(Tensor qa, Scalar b, Tensor(a!) out)-> Tensor(a!) out"); - m.def("mul_relu(Tensor qa, Tensor qb, float scale, int zero_point)-> Tensor qc"); - m.def("mul_relu.out(Tensor qa, Tensor qb, Tensor(a!) out)-> Tensor(a!) out"); - m.def("mul_relu.Scalar(Tensor qa, Scalar b)-> Tensor qc"); - m.def("mul_relu.Scalar_out(Tensor qa, Scalar b, Tensor(a!) out)-> Tensor(a!) out"); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::elu(Tensor self, float output_scale, int output_zero_point, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_prepack(Tensor weight) -> __torch__.torch.classes.quantized.EmbeddingPackedParamsBase W_prepack")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_unpack(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase W_prepack) -> Tensor W_origin")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_prepack(Tensor weight) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_unpack(Tensor weight) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_prepack(Tensor weight, bool optimized_qparams=False) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_unpack(Tensor weight) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_2bit_prepack(Tensor weight, bool optimized_qparams=False) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_2bit_unpack(Tensor weight) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_rowwise_offsets(Tensor weight, Tensor indices, Tensor? 
offsets=None, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_byte(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase weight, Tensor indices, bool sparse=False) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::celu(Tensor self, float output_scale, int output_zero_point, Scalar alpha=1) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::hardswish(Tensor input, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::group_norm(Tensor input, int num_groups, Tensor? weight, Tensor? bias, float eps, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::hardswish(Tensor input, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::instance_norm(Tensor input, Tensor? weight, Tensor? bias, float eps, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::layer_norm(Tensor input, int[] normalized_shape, Tensor? weight, Tensor? bias, float eps, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, float Y_scale_i, int Y_zero_point_i) -> Tensor Y")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_relu(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, float Y_scale_i, int Y_zero_point_i) -> Tensor Y")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_dynamic(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, bool reduce_range=False) -> Tensor Y")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_relu_dynamic(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, bool reduce_range=False) -> Tensor Y")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_dynamic_fp16(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack) -> Tensor Y")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_prepack(Tensor W, Tensor? B=None) -> __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_prepack_fp16(Tensor W, Tensor? B=None) -> __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_prepack_legacy(Tensor W, Tensor? B=None) -> Tensor W_prepack")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_prepack_fp16_legacy(Tensor W, Tensor? B=None) -> Tensor W_prepack")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_unpack(__torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack) -> (Tensor W_origin, Tensor? 
B_origin)")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_unpack_fp16(__torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack) -> (Tensor W_origin, Tensor? B_origin)")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_unpack.legacy(Tensor W_prepack) -> (Tensor W_origin, Tensor? B_origin)")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_unpack_fp16.legacy(Tensor W_prepack) -> (Tensor W_origin, Tensor? B_origin)")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul(Tensor qa, Tensor qb, float scale, int zero_point)-> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul.out(Tensor qa, Tensor qb, Tensor(a!) out)-> Tensor(a!) out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul.Scalar(Tensor qa, Scalar b)-> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul.Scalar_out(Tensor qa, Scalar b, Tensor(a!) out)-> Tensor(a!) out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_relu(Tensor qa, Tensor qb, float scale, int zero_point)-> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_relu.out(Tensor qa, Tensor qb, Tensor(a!) out)-> Tensor(a!) out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_relu.Scalar(Tensor qa, Scalar b)-> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_relu.Scalar_out(Tensor qa, Scalar b, Tensor(a!) out)-> Tensor(a!) out")); // deprecated functions, kept for backward compatibility - m.def("mul_out(Tensor qa, Tensor qb, Tensor(a!) out)-> Tensor(a!) out"); - m.def("mul_relu_out(Tensor qa, Tensor qb, Tensor(a!) out)-> Tensor(a!) out"); - m.def("mul_scalar(Tensor qa, Scalar b)-> Tensor qc"); - m.def("mul_scalar_relu(Tensor qa, Scalar b)-> Tensor qc"); - m.def("mul_scalar_out(Tensor qa, Scalar b, Tensor(a!) out)-> Tensor(a!) out"); - m.def("mul_scalar_relu_out(Tensor qa, Scalar b, Tensor(a!) out)-> Tensor(a!) out"); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_out(Tensor qa, Tensor qb, Tensor(a!) out)-> Tensor(a!) out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_relu_out(Tensor qa, Tensor qb, Tensor(a!) out)-> Tensor(a!) out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_scalar(Tensor qa, Scalar b)-> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_scalar_relu(Tensor qa, Scalar b)-> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_scalar_out(Tensor qa, Scalar b, Tensor(a!) out)-> Tensor(a!) out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_scalar_relu_out(Tensor qa, Scalar b, Tensor(a!) out)-> Tensor(a!) out")); // TODO: remove after broadcasting is supported - m.def("mul_scalar.Tensor(Tensor qa, Tensor b)-> Tensor qc"); - m.def("mul_scalar_relu.Tensor(Tensor qa, Tensor b)-> Tensor qc"); - m.def("mul_scalar_out.Tensor(Tensor qa, Tensor b, Tensor(a!) out)-> Tensor(a!) out"); - m.def("mul_scalar_relu_out.Tensor(Tensor qa, Tensor b, Tensor(a!) out)-> Tensor(a!) out"); - // NB: missing a space after comma here... - m.def("max_pool2d(Tensor qx, int[] kernel_size, int[] stride, int[] padding, int[] dilation,bool ceil_mode) -> Tensor"); - m.def("relu6(Tensor qx, bool inplace=False) -> Tensor"); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_scalar.Tensor(Tensor qa, Tensor b)-> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_scalar_relu.Tensor(Tensor qa, Tensor b)-> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_scalar_out.Tensor(Tensor qa, Tensor b, Tensor(a!) out)-> Tensor(a!) out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_scalar_relu_out.Tensor(Tensor qa, Tensor b, Tensor(a!) out)-> Tensor(a!) 
out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::max_pool1d(Tensor qx, int[] kernel_size, int[] stride, int[] padding, int[] dilation, bool ceil_mode) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::max_pool2d(Tensor qx, int[] kernel_size, int[] stride, int[] padding, int[] dilation, bool ceil_mode) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::relu6(Tensor qx, bool inplace=False) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::leaky_relu(Tensor qx, Scalar negative_slope, bool inplace, float output_scale, int output_zero_point) -> Tensor")); } // According to #33294: The "_" prefix registration will be // removed when the operators are all migrated to mobile. // https://github.com/pytorch/pytorch/issues/36510 TORCH_LIBRARY(_quantized, m) { - m.def("add(Tensor qa, Tensor qb, float scale, int zero_point) -> Tensor qc"); - m.def("conv2d(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"); - m.def("conv2d_relu(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"); - m.def("conv2d_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase"); - m.def( - "linear(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, float Y_scale_i, int Y_zero_point_i) -> Tensor Y"); - m.def( - "linear_dynamic(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, bool reduce_range=False) -> Tensor Y"); - m.def( - "linear_prepack(Tensor W, Tensor? B=None) -> __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack"); - m.def( - "linear_prepack_fp16(Tensor W, Tensor? B=None) -> __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack"); - m.def("linear_prepack_legacy(Tensor W, Tensor? B=None) -> Tensor W_prepack"); - m.def( - "linear_prepack_fp16_legacy(Tensor W, Tensor? B=None) -> Tensor W_prepack"); + m.def(TORCH_SELECTIVE_SCHEMA("_quantized::add(Tensor qa, Tensor qb, float scale, int zero_point) -> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("_quantized::conv2d(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("_quantized::conv2d_relu(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("_quantized::conv2d_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase")); + m.def(TORCH_SELECTIVE_SCHEMA("_quantized::linear(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, float Y_scale_i, int Y_zero_point_i) -> Tensor Y")); + m.def(TORCH_SELECTIVE_SCHEMA("_quantized::linear_dynamic(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, bool reduce_range=False) -> Tensor Y")); + m.def(TORCH_SELECTIVE_SCHEMA("_quantized::linear_prepack(Tensor W, Tensor? B=None) -> __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack")); + m.def(TORCH_SELECTIVE_SCHEMA("_quantized::linear_prepack_fp16(Tensor W, Tensor? B=None) -> __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack")); + m.def(TORCH_SELECTIVE_SCHEMA("_quantized::linear_prepack_legacy(Tensor W, Tensor? 
B=None) -> Tensor W_prepack")); + m.def(TORCH_SELECTIVE_SCHEMA("_quantized::linear_prepack_fp16_legacy(Tensor W, Tensor? B=None) -> Tensor W_prepack")); } diff --git a/aten/src/ATen/native/sparse/ParamUtils.cpp b/aten/src/ATen/native/sparse/ParamUtils.cpp new file mode 100644 index 000000000000..f2a4c97571b9 --- /dev/null +++ b/aten/src/ATen/native/sparse/ParamUtils.cpp @@ -0,0 +1,53 @@ +#include +#include +#include +#include + +namespace at { +namespace native { + +std::pair softmax_sparse_input_preprocessing( + const Tensor& input_, + const int64_t dim_, + const bool half_to_float, + CheckedFrom function_name) { + TORCH_INTERNAL_ASSERT(input_.is_sparse()); + TORCH_CHECK( + !half_to_float, + std::string(function_name) + + ": with half to float conversion is not supported on " + + input_.device().str()); + auto input = input_.coalesce(); + Tensor output = at::native::empty_like(input); + TORCH_CHECK( + dim_ >= 0 && dim_ < input.dim(), + ": dim must be non-negative and less than input dimensions"); + return std::make_pair(input, output); +} + +std::tuple softmax_backward_sparse_input_preprocessing( + const Tensor& grad_, + const Tensor& output_, + int64_t dim_, + const Tensor& input_, + CheckedFrom function_name) { + TensorArg grad_arg{grad_, "grad", 1}, output_arg{output_, "output", 2}; + checkSameSize(function_name, grad_arg, output_arg); + + int64_t dim = maybe_wrap_dim(dim_, grad_.dim()); + + auto grad = grad_.coalesce(); + auto output = output_.coalesce(); + + Tensor grad_input = at::native::empty_like(output); + TORCH_CHECK( + dim >= 0 && dim < grad.dim(), + ": dim must be non-negative and less than input dimensions"); + TORCH_CHECK( + grad.sparse_dim() == output.sparse_dim(), + ": grad and output sparse dimensions must be equal"); + return std::make_tuple(grad_input, grad, output); +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/sparse/ParamUtils.h b/aten/src/ATen/native/sparse/ParamUtils.h new file mode 100644 index 000000000000..c9b2e3d999ad --- /dev/null +++ b/aten/src/ATen/native/sparse/ParamUtils.h @@ -0,0 +1,24 @@ +#pragma once + +#include +#include +#include + +namespace at { +namespace native { + +TORCH_API std::pair softmax_sparse_input_preprocessing( + const Tensor& input_, + const int64_t dim_, + const bool half_to_float, + CheckedFrom function_name); + +TORCH_API std::tuple softmax_backward_sparse_input_preprocessing( + const Tensor& grad_, + const Tensor& output_, + int64_t dim_, + const Tensor& input_, + CheckedFrom function_name); + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/sparse/SoftMax.cpp b/aten/src/ATen/native/sparse/SoftMax.cpp index 1544c6e499e7..6070faf635c5 100644 --- a/aten/src/ATen/native/sparse/SoftMax.cpp +++ b/aten/src/ATen/native/sparse/SoftMax.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include namespace at { @@ -291,10 +292,10 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di if (dim >= sparse_dim) { if (LogSoftMax) { auto new_values = log_softmax_cpu(values, dim - sparse_dim + 1, false); - out_values.copy_(new_values); + out_values.set_(new_values); } else { auto new_values = softmax_cpu(values, dim - sparse_dim + 1, false); - out_values.copy_(new_values); + out_values.set_(new_values); } return; } @@ -411,17 +412,27 @@ void cpu_sparse_coo_softmax_backward(Tensor& grad_input, const Tensor& grad, con auto grad_offsets = get_offsets(grad_indices, sizes, -1); if (dim >= sparse_dim) { - for(int64_t i=0; i= 0 && dim_ < input.dim(), - "dim must 
be non-negative and less than input dimensions"); AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "softmax", [&] { - cpu_sparse_coo_softmax(output, input, dim_); + cpu_sparse_coo_softmax(output, input, dim); }); return output; } -Tensor log_softmax_sparse_cpu(const Tensor& input_, const int64_t dim_, const bool half_to_float) { - TORCH_INTERNAL_ASSERT(input_.is_sparse()); - TORCH_CHECK(!half_to_float, "log_softmax with half to float conversion is not supported on CPU"); - auto input = input_.coalesce(); - Tensor output = at::native::empty_like(input); +Tensor log_softmax_sparse_cpu( + const Tensor& input_, + const int64_t dim, + const bool half_to_float) { + Tensor input, output; + std::tie(input, output) = softmax_sparse_input_preprocessing( + input_, dim, half_to_float, "log_softmax"); if (input.numel() == 0) { return output; } - TORCH_CHECK(dim_ >= 0 && dim_ < input.dim(), - "dim must be non-negative and less than input dimensions"); AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "log_softmax", [&] { - cpu_sparse_coo_softmax(output, input, dim_); + cpu_sparse_coo_softmax(output, input, dim); }); return output; } @@ -542,26 +553,16 @@ Tensor softmax_backward_sparse_cpu( const Tensor& output_, int64_t dim_, const Tensor& input_) { - TensorArg grad_arg{grad_, "grad", 1}, output_arg{output_, "output", 2}; - checkSameSize("softmax_backward", grad_arg, output_arg); - - int64_t dim = maybe_wrap_dim(dim_, grad_.dim()); - - auto grad = grad_.coalesce(); - auto output = output_.coalesce(); - - Tensor grad_input = at::native::empty_like(output); + Tensor grad_input, grad, output; + std::tie(grad_input, grad, output) = + softmax_backward_sparse_input_preprocessing( + grad_, output_, dim_, input_, "softmax_backward"); if (output.numel() == 0) { return grad_input; } - TORCH_CHECK( - dim >= 0 && dim < grad.dim(), - "dim must be non-negative and less than input dimensions"); - TORCH_CHECK( - grad.sparse_dim() == output.sparse_dim(), - "grad and output sparse dimensions must be equal"); AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "softmax_backward", [&] { - cpu_sparse_coo_softmax_backward(grad_input, grad, output, dim); + cpu_sparse_coo_softmax_backward( + grad_input, grad, output, dim_); }); return grad_input; } @@ -571,26 +572,16 @@ Tensor log_softmax_backward_sparse_cpu( const Tensor& output_, int64_t dim_, const Tensor& input_) { - TensorArg grad_arg{grad_, "grad", 1}, output_arg{output_, "output", 2}; - checkSameSize("log_softmax_backward", grad_arg, output_arg); - - int64_t dim = maybe_wrap_dim(dim_, grad_.dim()); - - auto grad = grad_.coalesce(); - auto output = output_.coalesce(); - - Tensor grad_input = at::native::empty_like(output); + Tensor grad_input, grad, output; + std::tie(grad_input, grad, output) = + softmax_backward_sparse_input_preprocessing( + grad_, output_, dim_, input_, "log_softmax_backward"); if (output.numel() == 0) { return grad_input; } - TORCH_CHECK( - dim >= 0 && dim < grad.dim(), - "dim must be non-negative and less than input dimensions"); - TORCH_CHECK( - grad.sparse_dim() == output.sparse_dim(), - "grad and output sparse dimensions must be equal"); - AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "softmax_backward", [&] { - cpu_sparse_coo_softmax_backward(grad_input, grad, output, dim); + AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "log_softmax_backward", [&] { + cpu_sparse_coo_softmax_backward( + grad_input, grad, output, dim_); }); return grad_input; } diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp 
b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index 9eee5e056dff..2bb5842b4726 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -95,16 +95,17 @@ SparseTensor& mul_out_sparse_scalar(SparseTensor& r, const SparseTensor& t, Scal // log1p(SparseTensor) // -------------------------------------------------------------------- -// TODO: add in-place variant +// In-place log1p on uncoalesced tensors is not supported since the operation is not a linear map. +// Values of uncoalesced tensor corresponding to the same indices are summed +// and log1p(summed_value) != log1p(v1) + log1p(v2) SparseTensor& log1p_out_sparse(SparseTensor& r, const SparseTensor& t) { - AT_ASSERT(r.is_sparse()); - AT_ASSERT(t.is_sparse()); + TORCH_CHECK(r.is_sparse(), "Tensor should be sparse"); + TORCH_CHECK(t.is_sparse(), "Tensor should be sparse"); if (is_same_tensor(r, t)) { // don't have in-place log1p for uncoalesced input because coalesce() is not in-place - TORCH_CHECK( - r.is_coalesced(), "log1p: in-place on uncoalesced tensors is not supported yet!"); + TORCH_CHECK(r.is_coalesced(), "log1p: in-place on uncoalesced tensors is not supported"); } else { copy_sparse_to_sparse_(r, t.coalesce()); @@ -114,10 +115,53 @@ SparseTensor& log1p_out_sparse(SparseTensor& r, const SparseTensor& t) { } SparseTensor& log1p_sparse_(SparseTensor& t) { - TORCH_CHECK(t.is_coalesced(), "log1p: in-place on uncoalesced tensors is not supported yet!"); return log1p_out_sparse(t, t); } +// -------------------------------------------------------------------- +// neg(SparseTensor) +// -------------------------------------------------------------------- + +SparseTensor& neg_out_sparse(SparseTensor& r, const SparseTensor& t) { + TORCH_CHECK(r.is_sparse(), "Tensor should be sparse"); + TORCH_CHECK(t.is_sparse(), "Tensor should be sparse"); + + // copy_sparse_ does not perform the copy if it is the same tensor + copy_sparse_to_sparse_(r, t); + r._values().neg_(); + return r; +} + +SparseTensor& neg_sparse_(SparseTensor& t) { + return neg_out_sparse(t, t); +} + +// -------------------------------------------------------------------- +// asin(SparseTensor) +// -------------------------------------------------------------------- + +// In-place asin on uncoalesced tensors is not supported since the operation is not a linear map. 
+// Values of uncoalesced tensor corresponding to the same indices are summed +// and asin(summed_value) != asin(v1) + asin(v2) + +SparseTensor& asin_out_sparse(SparseTensor& r, const SparseTensor& t) { + TORCH_CHECK(r.is_sparse(), "Tensor should be sparse"); + TORCH_CHECK(t.is_sparse(), "Tensor should be sparse"); + + if (is_same_tensor(r, t)) { + // don't have in-place asin for uncoalesced input because coalesce() is not in-place, see above comment + TORCH_CHECK(r.is_coalesced(), "asin: in-place on uncoalesced tensors is not supported"); + } else { + copy_sparse_to_sparse_(r, t.coalesce()); + } + r._values().asin_(); + return r; +} + +SparseTensor& asin_sparse_(SparseTensor& t) { + return asin_out_sparse(t, t); +} + // -------------------------------------------------------------------- // pow(SparseTensor, Scalar) // -------------------------------------------------------------------- diff --git a/aten/src/ATen/native/sparse/cuda/SoftMax.cu b/aten/src/ATen/native/sparse/cuda/SoftMax.cu new file mode 100644 index 000000000000..26cb6aba04e0 --- /dev/null +++ b/aten/src/ATen/native/sparse/cuda/SoftMax.cu @@ -0,0 +1,641 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace at { +namespace native { +namespace { + +// Number of threads in a block given an input size up to MAX_BLOCK_SIZE +static int getNumThreads(int nElem) { +#if defined(__HIP_PLATFORM_HCC__) + int threadSizes[5] = {16, 32, 64, 128, 256}; +#else + int threadSizes[5] = {32, 64, 128, 256, 512}; +#endif + for (int i = 0; i != 5; ++i) { + if (nElem <= threadSizes[i]) { + return threadSizes[i]; + } + } + return threadSizes[4]; +} + +template +__global__ void cuda_sparse_coo_softmax_kernel( + int64_t* sorted_pool_indices, + int64_t size, + int64_t* pool_sizes, + int64_t* pool_offsets, + int64_t nvalues, + scalar_t* mx_rows, + PackedTensorAccessor input_values_acc, + PackedTensorAccessor output_values_acc) { + /* + See ATen/native/sparse/Softmax.cpp:cpu_sparse_coo_softmax for the CPU + implementation of the sparse softmax algorithm that this implementation is + based on. 
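+    Parallelization: each CUDA thread processes one pool of indices through a
+    grid-stride loop. For every dense column j it first accumulates the
+    max-shifted exponentials of the pool, then either normalizes by their sum
+    (softmax) or subtracts log of the sum (log-softmax).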
+ */ + int tid = threadIdx.x; + int blkid = blockIdx.x; + int blksz = blockDim.x; + int gridsz = gridDim.x; + + int index = tid + blkid * blksz; + int step = blksz * gridsz; + + while (index < size) { + int64_t offset = pool_offsets[index]; + int64_t* pool_indices = sorted_pool_indices + offset; + int64_t pool_indices_size = pool_sizes[index]; + scalar_t* mx_row = mx_rows + index * nvalues; + + for (int64_t j = 0; j < nvalues; j++) { + scalar_t exp_sums = 0; + for (int64_t p = 0; p < pool_indices_size; p++) { + auto i = pool_indices[p]; + auto values_row = input_values_acc[i]; + auto out_values_row = output_values_acc[i]; + + auto v = c10::cuda::compat::exp(values_row[j] - mx_row[j]); + if (!LogSoftMax) { + out_values_row[j] = v; + } + exp_sums += v; + } + for (int64_t p = 0; p < pool_indices_size; p++) { + auto i = pool_indices[p]; + auto values_row = input_values_acc[i]; + auto out_values_row = output_values_acc[i]; + + if (LogSoftMax) { + out_values_row[j] = values_row[j] - mx_row[j] - c10::cuda::compat::log(exp_sums); + } else { + out_values_row[j] *= 1.0 / exp_sums; + } + } + } + index += step; + } +} + +template +__global__ void cuda_sparse_coo_softmax_backward_kernel( + int64_t* sorted_pool_indices, + int64_t size, + int64_t* pool_sizes, + int64_t* pool_offsets, + int64_t nvalues, + int64_t grad_nnz, + int64_t* grad_offsets, + int64_t* out_offsets, + int64_t* lower_bound_values, + PackedTensorAccessor values_accessor, + PackedTensorAccessor out_values_accessor, + PackedTensorAccessor grad_values_accessor) { + /* + See ATen/native/sparse/Softmax.cpp:cpu_sparse_coo_softmax_backward for + the CPU implementation of the sparse softmax backward algorithm that this + implementation is based on. + */ + int tid = threadIdx.x; + int blkid = blockIdx.x; + int blksz = blockDim.x; + int gridsz = gridDim.x; + + int index = tid + blkid * blksz; + int step = blksz * gridsz; + + while (index < size) { + int64_t offset = pool_offsets[index]; + int64_t* pool_indices = sorted_pool_indices + offset; + int64_t pool_indices_size = pool_sizes[index]; + + for (int64_t k = 0; k < nvalues; k++) { + scalar_t tmp_row{0}; + + /* Compute tmp = - sum_j output_j * grad_j */ + for (int64_t p = 0; p < pool_indices_size; p++) { + auto i = pool_indices[p]; + auto out_values_row = out_values_accessor[i]; + auto j = lower_bound_values[i]; + + /* Update `tmp_row` accumulator only when limits and pools are valid */ + if (j < grad_nnz && (out_offsets[i] == grad_offsets[j])) { + auto grad_values_row = grad_values_accessor[j]; + if (LogSoftMax) { + tmp_row -= grad_values_row[k]; + } else { + tmp_row -= out_values_row[k] * grad_values_row[k]; + } + } + } + + /* Compute grad_input = output * (grad + tmp)*/ + for (int64_t p = 0; p < pool_indices_size; p++) { + auto i = pool_indices[p]; + auto out_values_row = out_values_accessor[i]; + auto values_row = values_accessor[i]; + auto j = lower_bound_values[i]; + if (j < grad_nnz && (out_offsets[i] == grad_offsets[j])) { + auto grad_values_row = grad_values_accessor[j]; + if (LogSoftMax) { + values_row[k] = grad_values_row[k] + + c10::cuda::compat::exp(out_values_row[k]) * tmp_row; + } else { + values_row[k] = + out_values_row[k] * (grad_values_row[k] + tmp_row); + } + } else { + if (LogSoftMax) { + values_row[k] = + c10::cuda::compat::exp(out_values_row[k]) * tmp_row; + } else { + values_row[k] = out_values_row[k] * tmp_row; + } + } + } + } + index += step; + } +} + +using thrust_ptr = thrust::device_ptr; + +Tensor get_offsets( + const Tensor& indices, + const IntArrayRef& sizes, + 
const int64_t dim) { + /* + See ATen/native/sparse/Softmax.cpp:get_offsets for the CPU + implementation of get_offsets function that this implementation is based on. + */ + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + auto ndim = indices.size(0); + auto nnz = indices.size(1); + std::vector host_strides(ndim, 1); + if (ndim > 1) { + for (int64_t i = ndim - 2; i >= 0; i--) { + host_strides[i] = + host_strides[i + 1] * (i + 1 == dim ? 1 : sizes[i + 1]); + } + } + auto strides = at::empty({ndim}, indices.options()); + auto strides_ptr = strides.data_ptr(); + + AT_CUDA_CHECK(cudaMemcpyAsync( + strides_ptr, host_strides.data(), host_strides.size() * sizeof(int64_t), + cudaMemcpyHostToDevice, + stream)); + + auto indices_accessor = indices.packed_accessor(); + + Tensor offsets = at::empty({nnz}, indices.options()); + + thrust::transform( + policy, + thrust::make_counting_iterator(int64_t(0)), + thrust::make_counting_iterator(int64_t(nnz)), + thrust::device_ptr(offsets.data_ptr()), + [indices_accessor, strides_ptr, dim, ndim] __device__(int64_t x) { + int64_t pool_index = 0; + for (int64_t j = 0; j < ndim; j++) { + if (j != dim) { + auto indices_row = indices_accessor[j]; + auto stride = strides_ptr[j]; + pool_index += stride * indices_row[x]; + } + } + return pool_index; + }); + return offsets; +} + +template +std::tuple compute_pool_max( + const Tensor& indices, + const Tensor& values, + const IntArrayRef& sizes, + int64_t nvalues, + const int64_t dim) { + /* + Return pools of indices that align with the given dimension and the + corresponding max values for each pool. + + See ATen/native/sparse/Softmax.cpp:get_offsets and + ATen/native/sparse/Softmax.cpp:cpu_sparse_coo_softmax for the CPU + implementation that this implementation is based on. 
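+    Implementation sketch: indices are flattened into pool keys by get_offsets,
+    sorted with thrust::sort, grouped with thrust::reduce_by_key to obtain the
+    pool sizes, and converted into pool offsets with an exclusive scan. When
+    requireMxRows is true, a per-pool row-wise maximum (mx_buffer) is also
+    computed for use in the numerically stable softmax.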
+ */ + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + auto nnz = indices.size(1); + auto offsets = get_offsets(indices, sizes, dim); + int64_t* offsets_ptr = offsets.data_ptr(); + + auto sorted_indices = at::empty({nnz}, indices.options()); + thrust_ptr sorted_indices_thrust_ptr(sorted_indices.data_ptr()); + thrust::sequence( + policy, sorted_indices_thrust_ptr, sorted_indices_thrust_ptr + nnz, 0); + + thrust::sort( + policy, + sorted_indices_thrust_ptr, + sorted_indices_thrust_ptr + nnz, + [offsets_ptr] __device__(int64_t x, int64_t y) { + return offsets_ptr[x] < offsets_ptr[y]; + }); + auto pool_sizes = at::empty({nnz}, indices.options()); + + auto new_end = thrust::reduce_by_key( + policy, + sorted_indices_thrust_ptr, + sorted_indices_thrust_ptr + nnz, + thrust::make_constant_iterator(int64_t(1)), + thrust::make_discard_iterator(), + thrust_ptr(pool_sizes.data_ptr()), + [offsets_ptr] __device__(int64_t x, int64_t y) { + return offsets_ptr[x] == offsets_ptr[y]; + }); + auto new_sz = thrust::distance( + thrust_ptr(pool_sizes.data_ptr()), new_end.second); + pool_sizes.resize_({new_sz}); + + auto pool_offsets = pool_sizes.clone(); + thrust_ptr pool_offsets_thrust_ptr( + pool_offsets.data_ptr()); + thrust::exclusive_scan( + policy, + pool_offsets_thrust_ptr, + pool_offsets_thrust_ptr + new_sz, + pool_offsets_thrust_ptr); + + Tensor mx_buffer; + if (requireMxRows) { + + auto values_accessor = + values.packed_accessor(); // {nnz, nvalues} + + mx_buffer = at::full({new_sz * nvalues}, Scalar(-std::numeric_limits::infinity()), values.options()); + + auto mx_buffer_ptr = mx_buffer.data_ptr(); + + auto pool_sizes_ptr = pool_sizes.data_ptr(); + auto sorted_indices_ptr = sorted_indices.data_ptr(); + auto pool_offsets_ptr = pool_offsets.data_ptr(); + + thrust::for_each( + policy, + thrust::make_counting_iterator(int64_t(0)), + thrust::make_counting_iterator(int64_t(new_sz)), + [values_accessor, + sorted_indices_ptr, + pool_sizes_ptr, + pool_offsets_ptr, + mx_buffer_ptr, + nvalues] __device__(int64_t index) { + int64_t curr_pool_size = pool_sizes_ptr[index]; + auto mx_row = mx_buffer_ptr + index * nvalues; + int64_t offset = pool_offsets_ptr[index]; + for (int64_t p = 0; p < curr_pool_size; p++) { + int64_t i = *(sorted_indices_ptr + offset + p); + auto values_row = values_accessor[i].data(); + for (int64_t j = 0; j < nvalues; j++) { + mx_row[j] = c10::cuda::compat::max(mx_row[j], values_row[j]); + } + } + }); + } + return std::make_tuple( + sorted_indices, pool_offsets, pool_sizes, mx_buffer); +} + +template +void cuda_sparse_coo_softmax( + Tensor& output, + const Tensor& input, + const int64_t dim) { + /* + See ATen/native/sparse/Softmax.cpp:cpu_sparse_coo_softmax for the CPU + implementation of the sparse softmax algorithm that this implementation is + based on. 
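+    High-level flow: when dim addresses a dense dimension (dim >= sparse_dim),
+    the regular dense (log_)softmax is applied directly to the values tensor;
+    otherwise compute_pool_max builds the pools and per-pool maxima, and the
+    cuda_sparse_coo_softmax_kernel above normalizes each pool independently.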
+ */ + auto sparse_dim = input.sparse_dim(); + auto indices = input._indices().contiguous(); + auto values = input._values().contiguous(); + auto out_values = output._values(); + auto out_indices = output._indices(); + out_values.resize_as_(values); + out_indices.resize_as_(indices); + out_indices.copy_(indices); + + if (dim >= sparse_dim) { + if (LogSoftMax) { + auto new_values = log_softmax_cuda(values, dim - sparse_dim + 1, false); + out_values.set_(new_values); + } else { + auto new_values = softmax_cuda(values, dim - sparse_dim + 1, false); + out_values.set_(new_values); + } + return; + } + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + auto nnz = values.size(0); + auto sizes = input.sizes(); + auto nvalues = values.numel() / nnz; + + /* Prepare accessors */ + auto values_2 = values.view({nnz, nvalues}); + auto values_accessor = values_2.packed_accessor(); + + auto out_values_2 = out_values.view({nnz, nvalues}); + auto out_values_accessor = out_values_2.packed_accessor(); + + Tensor sorted_indices; + Tensor pool_offsets; + Tensor pool_sizes; + Tensor mx_buffer; + + std::tie(sorted_indices, pool_offsets, pool_sizes, mx_buffer) = + compute_pool_max(indices, values_2, sizes, nvalues, dim); + + auto pool_size = pool_offsets.size(0); + int block_size = getNumThreads(pool_size); + const int grid_size = (pool_size + block_size - 1) / block_size; + + cuda_sparse_coo_softmax_kernel + <<>>( + sorted_indices.data_ptr(), + pool_size, + pool_sizes.data_ptr(), + pool_offsets.data_ptr(), + nvalues, + mx_buffer.data_ptr(), + values_accessor, + out_values_accessor); + THCudaCheck(cudaGetLastError()); +} + +template +void cuda_sparse_coo_softmax_backward( + Tensor& grad_input, + const Tensor& grad, + const Tensor& output, + const int64_t dim) { + /* + See ATen/native/sparse/Softmax.cpp:cpu_sparse_coo_softmax_backward for + the CPU implementation of the sparse softmax backward algorithm that this + implementation is based on. 
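+    High-level flow: for dim >= sparse_dim the dense (log_)softmax backward is
+    reused on the values, with grad and output nonzeros matched through their
+    offsets; otherwise grad offsets are located with thrust::lower_bound and
+    the backward kernel above accumulates the per-pool correction term.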
+ */ + auto sparse_dim = output.sparse_dim(); + auto sizes = output.sizes().vec(); + auto grad_indices = grad._indices().contiguous(); + auto grad_values = grad._values().contiguous(); + auto out_indices = output._indices().contiguous(); + auto out_values = output._values().contiguous(); + auto values = grad_input._values(); + auto indices = grad_input._indices(); + auto out_nnz = out_values.size(0); + auto grad_nnz = grad_values.size(0); + + values.resize_as_(out_values); + values.zero_(); + indices.resize_as_(out_indices); + indices.copy_(out_indices); + + auto out_offsets = get_offsets(out_indices, sizes, -1); + auto grad_offsets = get_offsets(grad_indices, sizes, -1); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + /* when dim >= sparse_dim the dense backward is used */ + if (dim >= sparse_dim) { + if (at::native::cuda_equal(out_offsets, grad_offsets) == true) { + Tensor unused = at::native::empty_like(grad_values); + if (LogSoftMax) { + auto r = log_softmax_backward_cuda(grad_values, out_values, dim - sparse_dim + 1, unused); + values.set_(r); + } else { + auto r = softmax_backward_cuda(grad_values, out_values, dim - sparse_dim + 1, unused); + values.set_(r); + } + } else { + auto host_out_offsets = + out_offsets.to(at::Device(kCPU), indices.dtype(), false, true); + auto host_grad_offsets = + grad_offsets.to(at::Device(kCPU), indices.dtype(), false, true); + auto out_offsets_accessor = host_out_offsets.data_ptr(); + auto grad_offsets_accessor = host_grad_offsets.data_ptr(); + for (int64_t i = 0; i < out_nnz; i++) { + Tensor unused = at::native::empty_like(grad_values); + auto low = thrust::lower_bound( + grad_offsets_accessor, + grad_offsets_accessor + grad_offsets.size(0), + out_offsets_accessor[i]); + auto j = low - grad_offsets_accessor; + /* + Compute output using dense backward only when limits and pools are valid + If this check is false then a sparse tensor with full of zeros is returned + */ + if (j < grad_nnz && out_offsets_accessor[i] == grad_offsets_accessor[j]) { + if (LogSoftMax) { + auto r = log_softmax_backward_cuda( + grad_values[j], out_values[i], dim - sparse_dim, unused); + values[i].copy_(r); + } else { + auto r = softmax_backward_cuda( + grad_values[j], out_values[i], dim - sparse_dim, unused); + values[i].copy_(r); + } + } + } + } + return; + } + + auto nnz = values.size(0); + auto nvalues = values.numel() / nnz; + + auto values_2 = values.view({nnz, nvalues}); + auto values_accessor = values_2.packed_accessor(); + + auto out_values_2 = out_values.view({out_nnz, nvalues}); + auto out_values_accessor = out_values_2.packed_accessor(); + + auto grad_values_2 = grad_values.view({grad_nnz, nvalues}); + auto grad_values_accessor = grad_values_2.packed_accessor(); + + Tensor lower_bound_values = + at::empty({out_offsets.size(0)}, indices.options()); + + thrust::lower_bound( + policy, + thrust_ptr(grad_offsets.data_ptr()), + thrust_ptr(grad_offsets.data_ptr() + grad_offsets.size(0)), + thrust_ptr(out_offsets.data_ptr()), + thrust_ptr(out_offsets.data_ptr()) + out_offsets.size(0), + thrust_ptr(lower_bound_values.data_ptr())); + + Tensor sorted_indices; + Tensor pool_offsets; + Tensor pool_sizes; + + /* Compute independent pools of indices */ + std::tie( + sorted_indices, pool_offsets, pool_sizes, std::ignore) = + compute_pool_max( + out_indices, values_2, sizes, nvalues, dim); + + auto pool_size = pool_offsets.size(0); + + int 
block_size = getNumThreads(pool_size); + const int grid_size = (pool_size + block_size - 1) / block_size; + + cuda_sparse_coo_softmax_backward_kernel + <<>>( + sorted_indices.data_ptr(), + pool_size, + pool_sizes.data_ptr(), + pool_offsets.data_ptr(), + nvalues, + grad_nnz, + grad_offsets.data_ptr(), + out_offsets.data_ptr(), + lower_bound_values.data_ptr(), + values_accessor, + out_values_accessor, + grad_values_accessor); + THCudaCheck(cudaGetLastError()); +} + +} // end anonymous namespace + +Tensor softmax_sparse_cuda( + const Tensor& input_, + const int64_t dim, + const bool half_to_float) { + Tensor input, output; + std::tie(input, output) = softmax_sparse_input_preprocessing( + input_, dim, half_to_float, "softmax"); + if (input.numel() == 0) { + return output; + } + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "softmax", [&] { + cuda_sparse_coo_softmax(output, input, dim); + }); + return output; +} + +Tensor log_softmax_sparse_cuda( + const Tensor& input_, + const int64_t dim, + const bool half_to_float) { + Tensor input, output; + std::tie(input, output) = softmax_sparse_input_preprocessing( + input_, dim, half_to_float, "log_softmax"); + if (input.numel() == 0) { + return output; + } + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "log_softmax", [&] { + cuda_sparse_coo_softmax(output, input, dim); + }); + return output; +} + +Tensor softmax_backward_sparse_cuda( + const Tensor& grad_, + const Tensor& output_, + int64_t dim_, + const Tensor& input_) { + Tensor grad_input, grad, output; + std::tie(grad_input, grad, output) = + softmax_backward_sparse_input_preprocessing( + grad_, output_, dim_, input_, "softmax_backward"); + if (output.numel() == 0) { + return grad_input; + } + AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "softmax_backward", [&] { + cuda_sparse_coo_softmax_backward( + grad_input, grad, output, dim_); + }); + return grad_input; +} + +Tensor log_softmax_backward_sparse_cuda( + const Tensor& grad_, + const Tensor& output_, + int64_t dim_, + const Tensor& input_) { + Tensor grad_input, grad, output; + std::tie(grad_input, grad, output) = + softmax_backward_sparse_input_preprocessing( + grad_, output_, dim_, input_, "log_softmax_backward"); + if (output.numel() == 0) { + return grad_input; + } + + AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "log_softmax_backward", [&] { + cuda_sparse_coo_softmax_backward( + grad_input, grad, output, dim_); + }); + return grad_input; +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/vulkan/Vulkan.h b/aten/src/ATen/native/vulkan/Vulkan.h index df9a53f7076d..c2b1775e8f0a 100644 --- a/aten/src/ATen/native/vulkan/Vulkan.h +++ b/aten/src/ATen/native/vulkan/Vulkan.h @@ -456,7 +456,7 @@ class ComputeUnit final { void createComputePipelineCompile( const std::string& glslSrc, const VkPipelineCache pipelineCache, - const VkDescriptorSetLayout& descrSetLayout, + const VkDescriptorSetLayout descrSetLayout, const WorkGroupSize workGroupSize); #endif diff --git a/aten/src/ATen/native/vulkan/api/Adapter.h b/aten/src/ATen/native/vulkan/api/Adapter.h new file mode 100644 index 000000000000..239edfb74518 --- /dev/null +++ b/aten/src/ATen/native/vulkan/api/Adapter.h @@ -0,0 +1,36 @@ +#pragma once + +#include +#include + +namespace at { +namespace native { +namespace vulkan { +namespace api { + +// +// A Vulkan Adapter represents a physical device and its properties. Adapters +// are enumerated through the Runtime and are used in creation of Contexts. 
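// Usage sketch for the Adapter/Runtime pairing introduced here (hypothetical
// call site, not part of this patch): enumerate adapters through the Runtime,
// pick the first one that advertises unified memory via the helper defined
// below, and build a Context from it. Runtime::select() throws if no adapter
// satisfies the predicate.

at::native::vulkan::api::Context make_unified_memory_context() {
  using namespace at::native::vulkan::api;
  Runtime* const rt = runtime(); // throws if Vulkan is unavailable
  const Adapter adapter = rt->select([](const Adapter& adapter) {
    return adapter.has_unified_memory();
  });
  return Context(adapter); // Context is move-constructible
}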
+// Each tensor in PyTorch is associated with a Context to make the +// device <-> tensor affinity explicit. +// + +struct Adapter final { + Runtime* runtime; + VkPhysicalDevice handle; + VkPhysicalDeviceProperties properties; + VkPhysicalDeviceMemoryProperties memory_properties; + uint32_t compute_queue_family_index; + + inline bool has_unified_memory() const { + // Ideally iterate over all memory types to see if there is a pool that + // is both host-visible, and device-local. This should be a good proxy + // for now. + return VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU == properties.deviceType; + } +}; + +} // namespace api +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/vulkan/api/Allocator.h b/aten/src/ATen/native/vulkan/api/Allocator.h index afa720a515e6..f0f0c9baa59c 100644 --- a/aten/src/ATen/native/vulkan/api/Allocator.h +++ b/aten/src/ATen/native/vulkan/api/Allocator.h @@ -2,11 +2,19 @@ #include +#ifdef DEBUG + #define VMA_DEBUG_LOG(format, ...) \ + do { \ + printf(format, ##__VA_ARGS__); \ + printf("\n"); \ + } while(false) +#endif /* DEBUG */ + #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wnullability-completeness" #pragma clang diagnostic ignored "-Wunused-variable" -#endif +#endif /* __clang__ */ // Do NOT include vk_mem_alloc.h directly. // Always include this file (Allocator.h) instead. @@ -15,4 +23,4 @@ #ifdef __clang__ #pragma clang diagnostic pop -#endif +#endif /* __clang__ */ diff --git a/aten/src/ATen/native/vulkan/api/Command.cpp b/aten/src/ATen/native/vulkan/api/Command.cpp index 21279b408233..a7793aea16dc 100644 --- a/aten/src/ATen/native/vulkan/api/Command.cpp +++ b/aten/src/ATen/native/vulkan/api/Command.cpp @@ -5,12 +5,15 @@ namespace native { namespace vulkan { namespace api { -Command::Pool::Factory::Factory(const VkDevice device) - : device_(device) { +Command::Pool::Factory::Factory(const GPU& gpu) + : device_(gpu.device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device_, + "Invalid Vulkan device!"); } typename Command::Pool::Factory::Handle Command::Pool::Factory::operator()( - const Descriptor& descriptor) const { + const Descriptor& descriptor) const { const VkCommandPoolCreateInfo command_pool_create_info{ VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, nullptr, @@ -20,7 +23,14 @@ typename Command::Pool::Factory::Handle Command::Pool::Factory::operator()( VkCommandPool command_pool{}; VK_CHECK(vkCreateCommandPool( - device_, &command_pool_create_info, nullptr, &command_pool)); + device_, + &command_pool_create_info, + nullptr, + &command_pool)); + + TORCH_CHECK( + command_pool, + "Invalid Vulkan command pool!"); return Handle{ command_pool, @@ -31,8 +41,13 @@ typename Command::Pool::Factory::Handle Command::Pool::Factory::operator()( void Command::Pool::purge( const VkDevice device, const VkCommandPool command_pool) { - TORCH_INTERNAL_ASSERT(device, "Invalid Vulkan device!"); - TORCH_INTERNAL_ASSERT(command_pool, "Invalid Vulkan command pool!"); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + command_pool, + "Invalid Vulkan command pool!"); VK_CHECK(vkResetCommandPool(device, command_pool, 0u)); } @@ -42,6 +57,14 @@ namespace { VkCommandBuffer allocate_command_buffer( const VkDevice device, const VkCommandPool command_pool) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + command_pool, + "Invalid Vulkan command pool!"); + const 
VkCommandBufferAllocateInfo command_buffer_allocate_info{ VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, nullptr, @@ -52,7 +75,13 @@ VkCommandBuffer allocate_command_buffer( VkCommandBuffer command_buffer{}; VK_CHECK(vkAllocateCommandBuffers( - device, &command_buffer_allocate_info, &command_buffer)); + device, + &command_buffer_allocate_info, + &command_buffer)); + + TORCH_CHECK( + command_buffer, + "Invalid Vulkan command buffer!"); return command_buffer; } @@ -61,6 +90,9 @@ VkCommandBuffer allocate_command_buffer( Command::Buffer::Buffer(const VkDevice device, const VkCommandPool command_pool) : command_buffer_(allocate_command_buffer(device, command_pool)) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + command_buffer_, + "Invalid Vulkan command buffer!"); } void Command::Buffer::Buffer::begin() { @@ -71,7 +103,9 @@ void Command::Buffer::Buffer::begin() { nullptr, }; - VK_CHECK(vkBeginCommandBuffer(command_buffer_, &command_buffer_begin_info)); + VK_CHECK(vkBeginCommandBuffer( + command_buffer_, + &command_buffer_begin_info)); } void Command::Buffer::Buffer::end() { @@ -79,16 +113,26 @@ void Command::Buffer::Buffer::end() { } void Command::Buffer::bind(const VkPipeline pipeline) { - TORCH_INTERNAL_ASSERT(pipeline, "Invalid Vulkan pipeline!"); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + pipeline, + "Invalid Vulkan pipeline!"); - vkCmdBindPipeline(command_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + vkCmdBindPipeline( + command_buffer_, + VK_PIPELINE_BIND_POINT_COMPUTE, + pipeline); } void Command::Buffer::bind( const VkPipelineLayout pipeline_layout, const VkDescriptorSet descriptor_set) { - TORCH_INTERNAL_ASSERT(pipeline_layout, "Invalid Vulkan pipeline layout!"); - TORCH_INTERNAL_ASSERT(descriptor_set, "Invalid Vulkan descriptor set!"); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + pipeline_layout, + "Invalid Vulkan pipeline layout!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + descriptor_set, + "Invalid Vulkan descriptor set!"); vkCmdBindDescriptorSets( command_buffer_, diff --git a/aten/src/ATen/native/vulkan/api/Command.h b/aten/src/ATen/native/vulkan/api/Command.h index 462a50fef7fd..b0c171faa490 100644 --- a/aten/src/ATen/native/vulkan/api/Command.h +++ b/aten/src/ATen/native/vulkan/api/Command.h @@ -9,7 +9,7 @@ namespace native { namespace vulkan { namespace api { -struct C10_EXPORT Command final { +struct Command final { // // Pool // @@ -29,7 +29,7 @@ struct C10_EXPORT Command final { class Factory final { public: - explicit Factory(VkDevice device); + explicit Factory(const GPU& gpu); typedef Pool::Descriptor Descriptor; typedef VK_DELETER(CommandPool) Deleter; @@ -52,8 +52,8 @@ struct C10_EXPORT Command final { typedef api::Cache Cache; Cache cache; - explicit Pool(const VkDevice device) - : cache(Factory(device)) { + explicit Pool(const GPU& gpu) + : cache(Factory(gpu)) { } static void purge(VkDevice device, VkCommandPool command_pool); @@ -78,8 +78,8 @@ struct C10_EXPORT Command final { VkCommandBuffer command_buffer_; }; - explicit Command(const VkDevice device) - : pool(device) { + explicit Command(const GPU& gpu) + : pool(gpu) { } }; diff --git a/aten/src/ATen/native/vulkan/api/Common.h b/aten/src/ATen/native/vulkan/api/Common.h index 0c1e7cc4720b..cbd53e8045ef 100644 --- a/aten/src/ATen/native/vulkan/api/Common.h +++ b/aten/src/ATen/native/vulkan/api/Common.h @@ -24,10 +24,10 @@ at::native::vulkan::api::destroy_##Handle #define VK_DELETER_DISPATCHABLE_DECLARE(Handle) \ - C10_EXPORT void destroy_##Handle(const Vk##Handle handle) + void destroy_##Handle(const Vk##Handle handle) #define 
VK_DELETER_NON_DISPATCHABLE_DECLARE(Handle) \ - class C10_EXPORT destroy_##Handle final { \ + class destroy_##Handle final { \ public: \ explicit destroy_##Handle(const VkDevice device); \ void operator()(const Vk##Handle handle) const; \ @@ -40,6 +40,21 @@ namespace native { namespace vulkan { namespace api { +struct Adapter; +struct Command; +class Context; +struct Descriptor; +struct Pipeline; +struct Resource; +class Runtime; +struct Shader; + +struct GPU final { + const Adapter* adapter; + VkDevice device; + VkQueue queue; +}; + VK_DELETER_DISPATCHABLE_DECLARE(Instance); VK_DELETER_DISPATCHABLE_DECLARE(Device); VK_DELETER_NON_DISPATCHABLE_DECLARE(Semaphore); @@ -78,11 +93,13 @@ class Handle final { Handle(const Handle&) = delete; Handle& operator=(const Handle&) = delete; Handle(Handle&&); - Handle& operator=(Handle&&); + Handle& operator=(Handle&&) &; + Handle& operator=(Handle&&) && = delete; ~Handle(); operator bool() const; - Type get() const; + Type get() const &; + Type get() const && = delete; Type release(); void reset(Type payload = kNull); @@ -112,7 +129,7 @@ inline Handle::Handle(Handle&& handle) template inline Handle& -Handle::operator=(Handle&& handle) +Handle::operator=(Handle&& handle) & { reset(handle.release()); deleter_ = std::move(handle.deleter_); @@ -130,7 +147,7 @@ inline Handle::operator bool() const { } template -inline Type Handle::get() const { +inline Type Handle::get() const & { return payload_; } diff --git a/aten/src/ATen/native/vulkan/api/Context.cpp b/aten/src/ATen/native/vulkan/api/Context.cpp index 76a245e16d38..d0fa08dbde1d 100644 --- a/aten/src/ATen/native/vulkan/api/Context.cpp +++ b/aten/src/ATen/native/vulkan/api/Context.cpp @@ -8,208 +8,31 @@ namespace vulkan { namespace api { namespace { -struct Configuration final { -#ifndef DEBUG - static constexpr bool kEnableValidationLayers = false; -#else - static constexpr bool kEnableValidationLayers = true; -#endif -}; - -VKAPI_ATTR VkBool32 VKAPI_CALL debug_report_callback_fn( - const VkDebugReportFlagsEXT flags, - const VkDebugReportObjectTypeEXT /* object_type */, - const uint64_t /* object */, - const size_t /* location */, - const int32_t message_code, - const char* const layer_prefix, - const char* const message, - void* const /* user_data */) { - std::stringstream stream; - stream << layer_prefix << " " << message_code << " " << message << std::endl; - const std::string log = stream.str(); - - if (flags & VK_DEBUG_REPORT_ERROR_BIT_EXT) { - LOG(ERROR) << log; - } else if (flags & VK_DEBUG_REPORT_WARNING_BIT_EXT) { - LOG(WARNING) << log; - } else if (flags & VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT) { - LOG(WARNING) << "Performance:" << log; - } else if (flags & VK_DEBUG_REPORT_INFORMATION_BIT_EXT) { - LOG(INFO) << log; - } else if (flags & VK_DEBUG_REPORT_DEBUG_BIT_EXT) { - LOG(INFO) << "Debug: " << log; - } - - return VK_FALSE; -} - -VkInstance create_instance(const bool enable_validation_layers) { - std::vector enabled_instance_layers; - std::vector enabled_instance_extensions; - - if (enable_validation_layers) { - uint32_t instance_layers_count = 0; - VK_CHECK(vkEnumerateInstanceLayerProperties( - &instance_layers_count, nullptr)); - - std::vector instance_layer_properties( - instance_layers_count); - - VK_CHECK(vkEnumerateInstanceLayerProperties( - &instance_layers_count, - instance_layer_properties.data())); - - constexpr const char* const requested_instance_layers[]{ - // "VK_LAYER_LUNARG_api_dump", - "VK_LAYER_KHRONOS_validation", - }; - - for (const auto& requested_instance_layer : 
requested_instance_layers) { - for (const auto& layer : instance_layer_properties) { - if (strcmp(requested_instance_layer, layer.layerName) == 0) { - enabled_instance_layers.push_back(requested_instance_layer); - break; - } - } - } - - uint32_t instance_extension_count = 0; - VK_CHECK(vkEnumerateInstanceExtensionProperties( - nullptr, &instance_extension_count, nullptr)); - - std::vector instance_extension_properties( - instance_extension_count); - - VK_CHECK(vkEnumerateInstanceExtensionProperties( - nullptr, &instance_extension_count, instance_extension_properties.data())); - - constexpr const char* const requested_instance_extensions[]{ - VK_EXT_DEBUG_REPORT_EXTENSION_NAME, - }; +Context* initialize() { + static const std::unique_ptr context([]() -> Context* { + try { + const Adapter adapter = runtime()->select([](const Adapter& adapter) { + // Select the first adapter. + return true; + }); - for (const auto& requested_instance_extension : requested_instance_extensions) { - for (const auto& extension : instance_extension_properties) { - if (strcmp(requested_instance_extension, extension.extensionName) == 0) { - enabled_instance_extensions.push_back(requested_instance_extension); - break; - } - } + return new Context(adapter); } - } - - constexpr VkApplicationInfo application_info{ - VK_STRUCTURE_TYPE_APPLICATION_INFO, - nullptr, - "PyTorch", - 0, - "PyTorch", - 0, - VK_API_VERSION_1_0, - }; - - const VkInstanceCreateInfo instance_create_info{ - VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, - nullptr, - 0u, - &application_info, - static_cast(enabled_instance_layers.size()), - enabled_instance_layers.data(), - static_cast(enabled_instance_extensions.size()), - enabled_instance_extensions.data(), - }; - - VkInstance instance{}; - VK_CHECK(vkCreateInstance(&instance_create_info, nullptr, &instance)); - - return instance; -} - -VkDebugReportCallbackEXT create_debug_report_callback( - const VkInstance instance, - const bool enable_validation_layers) { - if (!enable_validation_layers) { - return VkDebugReportCallbackEXT{}; - } - - const VkDebugReportCallbackCreateInfoEXT debugReportCallbackCreateInfo{ - VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, - nullptr, - VK_DEBUG_REPORT_INFORMATION_BIT_EXT | - VK_DEBUG_REPORT_WARNING_BIT_EXT | - VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT | - VK_DEBUG_REPORT_ERROR_BIT_EXT | - VK_DEBUG_REPORT_DEBUG_BIT_EXT, - debug_report_callback_fn, - nullptr, - }; - - const auto vkCreateDebugReportCallbackEXT = - (PFN_vkCreateDebugReportCallbackEXT)vkGetInstanceProcAddr( - instance, "vkCreateDebugReportCallbackEXT"); - - TORCH_CHECK( - vkCreateDebugReportCallbackEXT, - "Could not load vkCreateDebugReportCallbackEXT"); - - VkDebugReportCallbackEXT debug_report_callback{}; - VK_CHECK(vkCreateDebugReportCallbackEXT( - instance, - &debugReportCallbackCreateInfo, - nullptr, - &debug_report_callback)); - - return debug_report_callback; -} - -VkPhysicalDevice acquire_physical_device(const VkInstance instance) { - uint32_t device_count = 0; - VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, nullptr)); - TORCH_CHECK(device_count > 0, "Vulkan: Could not find a device with Vulkan support!"); - - std::vector devices(device_count); - VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, devices.data())); - - return devices[0]; -} - -VkPhysicalDeviceLimits query_physical_device_physical_device_limits( - const VkPhysicalDevice physical_device) { - VkPhysicalDeviceProperties physical_device_properties{}; - vkGetPhysicalDeviceProperties(physical_device, 
&physical_device_properties); - return physical_device_properties.limits; -} - -uint32_t query_compute_queue_family_index(const VkPhysicalDevice physical_device) { - uint32_t queue_family_count = 0; - - vkGetPhysicalDeviceQueueFamilyProperties( - physical_device, &queue_family_count, nullptr); - - TORCH_CHECK( - queue_family_count > 0, "Vulkan: Invalid number of queue families!"); - - std::vector queue_families_properties( - queue_family_count); - - vkGetPhysicalDeviceQueueFamilyProperties( - physical_device, &queue_family_count, queue_families_properties.data()); - - for (uint32_t i = 0; i < queue_families_properties.size(); ++i) { - const VkQueueFamilyProperties& properties = queue_families_properties[i]; - if (properties.queueCount > 0 && (properties.queueFlags & VK_QUEUE_COMPUTE_BIT)) { - return i; + catch (...) { + return nullptr; } - } + }()); - TORCH_CHECK( - false, - "Vulkan: Could not find a queue family that supports compute operations!"); + return context.get(); } VkDevice create_device( const VkPhysicalDevice physical_device, const uint32_t compute_queue_family_index) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + physical_device, + "Invalid Vulkan physical device!"); + const float queue_priorities = 1.0f; const VkDeviceQueueCreateInfo device_queue_create_info{ VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, @@ -234,6 +57,7 @@ VkDevice create_device( VkDevice device{}; VK_CHECK(vkCreateDevice(physical_device, &device_create_info, nullptr, &device)); + TORCH_CHECK(device, "Invalid Vulkan device!"); return device; } @@ -241,79 +65,45 @@ VkDevice create_device( VkQueue acquire_queue( const VkDevice device, const uint32_t compute_queue_family_index) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); + VkQueue queue{}; vkGetDeviceQueue(device, compute_queue_family_index, 0, &queue); + TORCH_CHECK(queue, "Invalid Vulkan queue!"); + return queue; } } // namespace -Context::Context(const bool enable_validation_layers) - : instance_(create_instance(enable_validation_layers), &VK_DELETER(Instance)), - debug_report_callback_( - create_debug_report_callback(instance(), enable_validation_layers), - Debug(instance())), - physical_device_(acquire_physical_device(instance())), - physical_device_limits_(query_physical_device_physical_device_limits(physical_device())), - compute_queue_family_index_(query_compute_queue_family_index(physical_device())), - device_(create_device(physical_device(), compute_queue_family_index_), &VK_DELETER(Device)), - queue_(acquire_queue(device(), compute_queue_family_index_)), - command_(device()), - shader_(device()), - pipeline_(device()), - descriptor_(device()), - resource_(instance(), physical_device(), device()) { -} - -Context::Debug::Debug(const VkInstance instance) - : instance_(instance) { -} - -void Context::Debug::operator()( - const VkDebugReportCallbackEXT debug_report_callback) const { - if (debug_report_callback) { - const auto vkDestroyDebugReportCallbackEXT = - (PFN_vkDestroyDebugReportCallbackEXT)vkGetInstanceProcAddr( - instance_, "vkDestroyDebugReportCallbackEXT"); - - TORCH_CHECK( - vkDestroyDebugReportCallbackEXT, - "Could not load vkDestroyDebugReportCallbackEXT"); - - vkDestroyDebugReportCallbackEXT( - instance_, debug_report_callback, nullptr); - } -} - -Context* initialize() { - static const std::unique_ptr context([]() -> Context* { -#ifdef USE_VULKAN_WRAPPER - if (!InitVulkan()) { - TORCH_WARN("Vulkan: Wrapper Failed to InitVulkan"); - return nullptr; - } -#endif - - try { - return new 
Context(Configuration::kEnableValidationLayers); - } - catch (...) { - return nullptr; - } - }()); - - return context.get(); +void Context::Deleter::operator()(const VkDevice device) const { + // No VK_CHECK. Don't want an exception thrown in the destructor. + vkDeviceWaitIdle(device); + vkDestroyDevice(device, nullptr); } -bool available() { - return initialize(); +Context::Context(const Adapter& adapter) + : adapter_(adapter), + device_( + create_device( + adapter.handle, + adapter.compute_queue_family_index), + Deleter{}), + queue_(acquire_queue(device(), adapter.compute_queue_family_index)), + command_(gpu()), + shader_(gpu()), + pipeline_(gpu()), + descriptor_(gpu()), + resource_(gpu()) { } -Context& context() { +Context* context() { Context* const context = initialize(); TORCH_CHECK(context, "Vulkan: Backend not available on this platform!"); - return *context; + return context; } } // namespace api diff --git a/aten/src/ATen/native/vulkan/api/Context.h b/aten/src/ATen/native/vulkan/api/Context.h index d57eab66108e..5d593bdd9bc1 100644 --- a/aten/src/ATen/native/vulkan/api/Context.h +++ b/aten/src/ATen/native/vulkan/api/Context.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -14,34 +15,29 @@ namespace api { // // Vulkan Context holds onto all relevant Vulkan state as it pertains to our -// use of Vulkan in PyTorch. The context is currently a global object, but -// technically it does not need to be if we were to make it explicit to the -// user. +// use of Vulkan in PyTorch. A Context is associated with one, and only one, +// Adapter as a precursor to multi-GPU support. All Vulkan tensors in PyTorch +// are associated with a Context to make tensor <-> device affinity explicit. +// The context is currently a global object, but technically it does not need +// to be if we were to make it explicit to the user. // -class C10_EXPORT Context final { +class Context final { public: - explicit Context(bool enable_validation_layers); + explicit Context(const Adapter& adapter); + Context(const Context&) = delete; + Context(Context&&) = default; + Context& operator=(const Context&) = delete; + Context& operator=(Context&&) = default; ~Context() = default; - inline VkInstance instance() const { - return instance_.get(); - } - - inline VkPhysicalDevice physical_device() const { - return physical_device_; - } - - inline const VkPhysicalDeviceLimits& physical_device_limits() const { - return physical_device_limits_; - } - - inline VkDevice device() const { - return device_.get(); - } - - inline VkQueue queue() const { - return queue_; + inline GPU gpu() { + // A GPU is simply a (physical device, logical device, device queue) trio. + return { + &adapter_, + device(), + queue(), + }; } inline Command& command() { @@ -65,23 +61,26 @@ class C10_EXPORT Context final { } private: - class Debug final { - public: - explicit Debug(VkInstance instance); - void operator()(VkDebugReportCallbackEXT debug_report_callback) const; + inline VkDevice device() { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_); + return device_.get(); + } - private: - VkInstance instance_; + inline VkQueue queue() { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(queue_); + return queue_; + } + + private: + class Deleter final { + public: + void operator()(VkDevice device) const; }; private: // Construction and destruction order matters. Do not move members around. 
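// Why the ordering comment above matters: C++ destroys non-static data
// members in reverse declaration order, so the subsystems that depend on
// device_ (command_, shader_, pipeline_, descriptor_, resource_) are torn
// down before the VkDevice they were created from. A minimal illustration of
// that language rule (standalone example, unrelated to Vulkan):

#include <iostream>

struct Logged {
  const char* name;
  ~Logged() { std::cout << "destroying " << name << '\n'; }
};

struct Ordered {
  Logged device{"device"};       // declared first, destroyed last
  Logged resources{"resources"}; // declared last, destroyed first
};

int main() {
  Ordered ordered; // prints "destroying resources", then "destroying device"
}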
- Handle instance_; - Handle debug_report_callback_; - VkPhysicalDevice physical_device_; - VkPhysicalDeviceLimits physical_device_limits_; - uint32_t compute_queue_family_index_; - Handle device_; + Adapter adapter_; + Handle device_; VkQueue queue_; Command command_; Shader shader_; @@ -90,8 +89,7 @@ class C10_EXPORT Context final { Resource resource_; }; -C10_EXPORT bool available(); -C10_EXPORT Context& context(); +Context* context(); } // namespace api } // namespace vulkan diff --git a/aten/src/ATen/native/vulkan/api/Descriptor.cpp b/aten/src/ATen/native/vulkan/api/Descriptor.cpp index 1b5ea94341a3..ff0505ccebca 100644 --- a/aten/src/ATen/native/vulkan/api/Descriptor.cpp +++ b/aten/src/ATen/native/vulkan/api/Descriptor.cpp @@ -44,12 +44,15 @@ const Descriptor::Pool::Descriptor Descriptor::Pool::kDefault{ }, }; -Descriptor::Pool::Factory::Factory(const VkDevice device) - : device_(device) { +Descriptor::Pool::Factory::Factory(const GPU& gpu) + : device_(gpu.device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); } typename Descriptor::Pool::Factory::Handle Descriptor::Pool::Factory::operator()( - const Descriptor& descriptor) const { + const Descriptor& descriptor) const { const VkDescriptorPoolCreateInfo descriptor_pool_create_info{ VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, nullptr, @@ -61,7 +64,14 @@ typename Descriptor::Pool::Factory::Handle Descriptor::Pool::Factory::operator() VkDescriptorPool descriptor_pool{}; VK_CHECK(vkCreateDescriptorPool( - device_, &descriptor_pool_create_info, nullptr, &descriptor_pool)); + device_, + &descriptor_pool_create_info, + nullptr, + &descriptor_pool)); + + TORCH_CHECK( + descriptor_pool, + "Invalid Vulkan descriptor pool!"); return Handle{ descriptor_pool, @@ -72,12 +82,29 @@ typename Descriptor::Pool::Factory::Handle Descriptor::Pool::Factory::operator() void Descriptor::Pool::purge( const VkDevice device, const VkDescriptorPool descriptor_pool) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + descriptor_pool, + "Invalid Vulkan descriptor pool!"); + VK_CHECK(vkResetDescriptorPool(device, descriptor_pool, 0u)); } -Descriptor::Factory::Factory(const VkDevice device, const VkDescriptorPool descriptor_pool) +Descriptor::Factory::Factory( + const VkDevice device, + const VkDescriptorPool descriptor_pool) : device_(device), descriptor_pool_(descriptor_pool) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + descriptor_pool, + "Invalid Vulkan descriptor pool!"); } VkDescriptorSet Descriptor::Factory::allocate( @@ -92,7 +119,13 @@ VkDescriptorSet Descriptor::Factory::allocate( VkDescriptorSet descriptor_set{}; VK_CHECK(vkAllocateDescriptorSets( - device_, &descriptor_set_allocate_info, &descriptor_set)); + device_, + &descriptor_set_allocate_info, + &descriptor_set)); + + TORCH_CHECK( + descriptor_set, + "Invalid Vulkan descriptor set!"); return descriptor_set; } diff --git a/aten/src/ATen/native/vulkan/api/Descriptor.h b/aten/src/ATen/native/vulkan/api/Descriptor.h index 3e339ae4641f..bc6c14723990 100644 --- a/aten/src/ATen/native/vulkan/api/Descriptor.h +++ b/aten/src/ATen/native/vulkan/api/Descriptor.h @@ -49,7 +49,7 @@ namespace api { // as well. This behavior is by design. 
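// With available() now living in Runtime.h and context() returning a checked
// pointer, a call site would look roughly like this (hypothetical sketch, not
// part of this patch):

void run_if_vulkan_available() {
  namespace api = at::native::vulkan::api;
  if (!api::available()) {
    return; // Vulkan backend not present on this platform
  }
  api::Context* const context = api::context(); // TORCH_CHECK'd to be non-null
  api::Command& command = context->command();   // per-context command state
  (void)command;
}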
// -struct C10_EXPORT Descriptor final { +struct Descriptor final { // // Pool // @@ -72,7 +72,7 @@ struct C10_EXPORT Descriptor final { class Factory final { public: - explicit Factory(VkDevice device); + explicit Factory(const GPU& gpu); typedef Pool::Descriptor Descriptor; typedef VK_DELETER(DescriptorPool) Deleter; @@ -95,8 +95,8 @@ struct C10_EXPORT Descriptor final { typedef api::Cache Cache; Cache cache; - explicit Pool(const VkDevice device) - : cache(Factory(device)) { + explicit Pool(const GPU& gpu) + : cache(Factory(gpu)) { } static void purge(VkDevice device, VkDescriptorPool descriptor_pool); @@ -118,9 +118,9 @@ struct C10_EXPORT Descriptor final { VkDescriptorPool descriptor_pool_; } factory; - explicit Descriptor(const VkDevice device) - : pool(device), - factory(device, pool.cache.retrieve(Pool::kDefault)) { + explicit Descriptor(const GPU& gpu) + : pool(gpu), + factory(gpu.device, pool.cache.retrieve(Pool::kDefault)) { } }; @@ -156,8 +156,8 @@ inline size_t Descriptor::Pool::Factory::Hasher::operator()( } // namespace at inline bool operator==( - const VkDescriptorPoolSize& descriptor_pool_size_1, - const VkDescriptorPoolSize& descriptor_pool_size_2) { - return (descriptor_pool_size_1.type == descriptor_pool_size_2.type) && - (descriptor_pool_size_1.descriptorCount == descriptor_pool_size_2.descriptorCount); + const VkDescriptorPoolSize& _1, + const VkDescriptorPoolSize& _2) { + return (_1.type == _2.type) && + (_1.descriptorCount == _2.descriptorCount); } diff --git a/aten/src/ATen/native/vulkan/api/Pipeline.cpp b/aten/src/ATen/native/vulkan/api/Pipeline.cpp index 303eea7cb401..bd9881c05443 100644 --- a/aten/src/ATen/native/vulkan/api/Pipeline.cpp +++ b/aten/src/ATen/native/vulkan/api/Pipeline.cpp @@ -5,12 +5,19 @@ namespace native { namespace vulkan { namespace api { -Pipeline::Layout::Factory::Factory(const VkDevice device) - : device_(device) { +Pipeline::Layout::Factory::Factory(const GPU& gpu) + : device_(gpu.device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device_, + "Invalid Vulkan device!"); } typename Pipeline::Layout::Factory::Handle Pipeline::Layout::Factory::operator()( const Descriptor& descriptor) const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + descriptor.descriptor_set_layout, + "Invalid Vulkan descriptor set layout!"); + const VkPipelineLayoutCreateInfo pipeline_layout_create_info{ VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, nullptr, @@ -23,7 +30,14 @@ typename Pipeline::Layout::Factory::Handle Pipeline::Layout::Factory::operator() VkPipelineLayout pipeline_layout{}; VK_CHECK(vkCreatePipelineLayout( - device_, &pipeline_layout_create_info, nullptr, &pipeline_layout)); + device_, + &pipeline_layout_create_info, + nullptr, + &pipeline_layout)); + + TORCH_CHECK( + pipeline_layout, + "Invalid Vulkan pipeline layout!"); return Handle{ pipeline_layout, @@ -34,6 +48,10 @@ typename Pipeline::Layout::Factory::Handle Pipeline::Layout::Factory::operator() namespace { VkPipelineCache create_pipeline_cache(const VkDevice device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); + const VkPipelineCacheCreateInfo pipeline_cache_create_info{ VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO, nullptr, @@ -44,20 +62,44 @@ VkPipelineCache create_pipeline_cache(const VkDevice device) { VkPipelineCache pipeline_cache{}; VK_CHECK(vkCreatePipelineCache( - device, &pipeline_cache_create_info, nullptr, &pipeline_cache)); + device, + &pipeline_cache_create_info, + nullptr, + &pipeline_cache)); + + TORCH_CHECK( + pipeline_cache, + "Invalid Vulkan pipeline 
cache!"); return pipeline_cache; } } // namespace -Pipeline::Factory::Factory(const VkDevice device) - : device_(device), - pipeline_cache_(create_pipeline_cache(device), VK_DELETER(PipelineCache)(device)) { +Pipeline::Factory::Factory(const GPU& gpu) + : device_(gpu.device), + pipeline_cache_( + create_pipeline_cache(device_), + VK_DELETER(PipelineCache)(device_)) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device_, + "Invalid Vulkan device!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + pipeline_cache_, + "Invalid Vulkan pipeline cache!"); } typename Pipeline::Factory::Handle Pipeline::Factory::operator()( const Descriptor& descriptor) const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + descriptor.pipeline_layout, + "Invalid Vulkan pipeline layout!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + descriptor.shader_module, + "Invalid Vulkan shader module!"); + constexpr uint32_t x_offset = 0u; constexpr uint32_t x_size = sizeof(Shader::WorkGroup::x); constexpr uint32_t y_offset = x_offset + x_size; @@ -113,7 +155,16 @@ typename Pipeline::Factory::Handle Pipeline::Factory::operator()( VkPipeline pipeline{}; VK_CHECK(vkCreateComputePipelines( - device_, pipeline_cache_.get(), 1u, &compute_pipeline_create_info, nullptr, &pipeline)); + device_, + pipeline_cache_.get(), + 1u, + &compute_pipeline_create_info, + nullptr, + &pipeline)); + + TORCH_CHECK( + pipeline, + "Invalid Vulkan pipeline!"); return Handle{ pipeline, diff --git a/aten/src/ATen/native/vulkan/api/Pipeline.h b/aten/src/ATen/native/vulkan/api/Pipeline.h index a5d72324c36e..c327a140eded 100644 --- a/aten/src/ATen/native/vulkan/api/Pipeline.h +++ b/aten/src/ATen/native/vulkan/api/Pipeline.h @@ -29,7 +29,7 @@ namespace api { // these Vulkan objects. // -struct C10_EXPORT Pipeline final { +struct Pipeline final { // // Layout // @@ -49,7 +49,7 @@ struct C10_EXPORT Pipeline final { class Factory final { public: - explicit Factory(VkDevice device); + explicit Factory(const GPU& gpu); typedef Layout::Descriptor Descriptor; typedef VK_DELETER(PipelineLayout) Deleter; @@ -72,8 +72,8 @@ struct C10_EXPORT Pipeline final { typedef api::Cache Cache; Cache cache; - explicit Layout(const VkDevice device) - : cache(Factory(device)) { + explicit Layout(const GPU& gpu) + : cache(Factory(gpu)) { } } layout; @@ -93,7 +93,7 @@ struct C10_EXPORT Pipeline final { class Factory final { public: - explicit Factory(VkDevice device); + explicit Factory(const GPU& gpu); typedef Pipeline::Descriptor Descriptor; typedef VK_DELETER(Pipeline) Deleter; @@ -117,9 +117,9 @@ struct C10_EXPORT Pipeline final { typedef api::Cache Cache; Cache cache; - explicit Pipeline(const VkDevice device) - : layout(device), - cache(Factory(device)) { + explicit Pipeline(const GPU& gpu) + : layout(gpu), + cache(Factory(gpu)) { } }; diff --git a/aten/src/ATen/native/vulkan/api/Resource.cpp b/aten/src/ATen/native/vulkan/api/Resource.cpp index c538a1b6e2d0..6969883cb183 100644 --- a/aten/src/ATen/native/vulkan/api/Resource.cpp +++ b/aten/src/ATen/native/vulkan/api/Resource.cpp @@ -1,4 +1,5 @@ #include +#include namespace at { namespace native { @@ -10,6 +11,18 @@ VmaAllocator create_allocator( const VkInstance instance, const VkPhysicalDevice physical_device, const VkDevice device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + instance, + "Invalid Vulkan instance!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + physical_device, + "Invalid Vulkan physical device!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); + const VmaAllocatorCreateInfo allocator_create_info{ 0u, physical_device, 
@@ -27,6 +40,7 @@ VmaAllocator create_allocator( VmaAllocator allocator{}; VK_CHECK(vmaCreateAllocator(&allocator_create_info, &allocator)); + TORCH_CHECK(allocator, "Invalid VMA allocator!"); return allocator; } @@ -46,6 +60,7 @@ VmaAllocationCreateInfo create_allocation_create_info( } void release_buffer(const Resource::Buffer& buffer) { + // Safe to pass null as buffer or allocation. vmaDestroyBuffer( buffer.memory.allocator, buffer.handle, @@ -59,6 +74,7 @@ void release_image(const Resource::Image& image) { vkDestroyImageView(allocator_info.device, image.view, nullptr); } + // Safe to pass null as image or allocation. vmaDestroyImage( image.memory.allocator, image.handle, @@ -87,6 +103,13 @@ Resource::Memory::Scope::Scope( : allocator_(allocator), allocation_(allocation), access_(access) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + allocator, + "Invalid VMA allocator!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + allocation, + "Invalid VMA allocation!"); } void Resource::Memory::Scope::operator()(const void* const data) const { @@ -104,17 +127,20 @@ void Resource::Memory::Scope::operator()(const void* const data) const { } } -Resource::Pool::Pool( - const VkInstance instance, - const VkPhysicalDevice physical_device, - const VkDevice device) - : device_(device), - allocator_(create_allocator(instance, physical_device, device), vmaDestroyAllocator) { +Resource::Pool::Pool(const GPU& gpu) + : device_(gpu.device), + allocator_( + create_allocator( + gpu.adapter->runtime->instance(), + gpu.adapter->handle, + device_), + vmaDestroyAllocator) { buffers_.reserve(Configuration::kReserve); images_.reserve(Configuration::kReserve); } -Resource::Buffer Resource::Pool::allocate(const Buffer::Descriptor& descriptor) { +Resource::Buffer Resource::Pool::allocate( + const Buffer::Descriptor& descriptor) { const VkBufferCreateInfo buffer_create_info{ VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, nullptr, @@ -141,6 +167,9 @@ Resource::Buffer Resource::Pool::allocate(const Buffer::Descriptor& descriptor) &allocation, &allocation_info)); + TORCH_CHECK(buffer, "Invalid Vulkan buffer!"); + TORCH_CHECK(allocation, "Invalid VMA allocation!"); + buffers_.emplace_back( Buffer{ buffer, @@ -155,7 +184,8 @@ Resource::Buffer Resource::Pool::allocate(const Buffer::Descriptor& descriptor) return buffers_.back().get(); } -Resource::Image Resource::Pool::allocate(const Image::Descriptor& descriptor) { +Resource::Image Resource::Pool::allocate( + const Image::Descriptor& descriptor) { const VkImageCreateInfo image_create_info{ VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, nullptr, @@ -189,6 +219,9 @@ Resource::Image Resource::Pool::allocate(const Image::Descriptor& descriptor) { &allocation, &allocation_info)); + TORCH_CHECK(image, "Invalid Vulkan image!"); + TORCH_CHECK(allocation, "Invalid VMA allocation!"); + const VkImageViewCreateInfo image_view_create_info{ VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, nullptr, @@ -213,7 +246,14 @@ Resource::Image Resource::Pool::allocate(const Image::Descriptor& descriptor) { VkImageView view{}; VK_CHECK(vkCreateImageView( - device_, &image_view_create_info, nullptr, &view)) + device_, + &image_view_create_info, + nullptr, + &view)); + + TORCH_CHECK( + view, + "Invalid Vulkan image view!"); images_.emplace_back( Image{ diff --git a/aten/src/ATen/native/vulkan/api/Resource.h b/aten/src/ATen/native/vulkan/api/Resource.h index 04cd9a067663..00145ebe071f 100644 --- a/aten/src/ATen/native/vulkan/api/Resource.h +++ b/aten/src/ATen/native/vulkan/api/Resource.h @@ -8,7 +8,7 @@ namespace native { namespace vulkan { 
namespace api { -struct C10_EXPORT Resource final { +struct Resource final { /* Memory */ @@ -25,12 +25,25 @@ struct C10_EXPORT Resource final { template< typename Type, typename Pointer = std::add_pointer_t>> - Data map() const; + Data map() const &; template< typename Type, typename Pointer = std::add_pointer_t> - Data map(); + Data map() &; + + private: + // Intentionally disabed to ensure memory access is always properly + // encapsualted in a scoped map-unmap region. Allowing below overloads + // to be invoked on a temporary would open the door to the possibility + // of accessing the underlying memory out of the expected scope making + // for seemingly ineffective memory writes and hard to hunt down bugs. + + template + Data map() const && = delete; + + template + Data map() && = delete; }; /* @@ -95,10 +108,7 @@ struct C10_EXPORT Resource final { class Pool final { public: - Pool( - VkInstance instance, - VkPhysicalDevice physical_device, - VkDevice device); + explicit Pool(const GPU& gpu); Buffer allocate(const Buffer::Descriptor& descriptor); Image allocate(const Image::Descriptor& descriptor); @@ -115,11 +125,8 @@ struct C10_EXPORT Resource final { std::vector> images_; } pool; - Resource( - const VkInstance instance, - const VkPhysicalDevice physical_device, - const VkDevice device) - : pool(instance, physical_device, device) { + explicit Resource(const GPU& gpu) + : pool(gpu) { } }; @@ -144,7 +151,7 @@ class Resource::Memory::Scope final { }; template -inline Resource::Memory::Data Resource::Memory::map() const { +inline Resource::Memory::Data Resource::Memory::map() const & { void* map(const Memory& memory); return Data{ @@ -154,7 +161,7 @@ inline Resource::Memory::Data Resource::Memory::map() const { } template -inline Resource::Memory::Data Resource::Memory::map() { +inline Resource::Memory::Data Resource::Memory::map() & { void* map(const Memory& memory); return Data{ diff --git a/aten/src/ATen/native/vulkan/api/Runtime.cpp b/aten/src/ATen/native/vulkan/api/Runtime.cpp new file mode 100644 index 000000000000..ce6e3b4231e4 --- /dev/null +++ b/aten/src/ATen/native/vulkan/api/Runtime.cpp @@ -0,0 +1,343 @@ +#include +#include + +#include + +namespace at { +namespace native { +namespace vulkan { +namespace api { +namespace { + +struct Configuration final { +#ifndef DEBUG + static constexpr Runtime::Type kRuntime = Runtime::Type::Debug; +#else + static constexpr Runtime::Type kRuntime = Runtime::Type::Release; +#endif +}; + +VKAPI_ATTR VkBool32 VKAPI_CALL debug_report_callback_fn( + const VkDebugReportFlagsEXT flags, + const VkDebugReportObjectTypeEXT /* object_type */, + const uint64_t /* object */, + const size_t /* location */, + const int32_t message_code, + const char* const layer_prefix, + const char* const message, + void* const /* user_data */) { + std::stringstream stream; + stream << layer_prefix << " " << message_code << " " << message << std::endl; + const std::string log = stream.str(); + + if (flags & VK_DEBUG_REPORT_ERROR_BIT_EXT) { + LOG(ERROR) << log; + } else if (flags & VK_DEBUG_REPORT_WARNING_BIT_EXT) { + LOG(WARNING) << log; + } else if (flags & VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT) { + LOG(WARNING) << "Performance:" << log; + } else if (flags & VK_DEBUG_REPORT_INFORMATION_BIT_EXT) { + LOG(INFO) << log; + } else if (flags & VK_DEBUG_REPORT_DEBUG_BIT_EXT) { + LOG(INFO) << "Debug: " << log; + } + + return VK_FALSE; +} + +VkInstance create_instance(const Runtime::Type type) { + std::vector enabled_instance_layers; + std::vector 
enabled_instance_extensions; + + if (Runtime::Type::Debug == type) { + uint32_t instance_layers_count = 0; + VK_CHECK(vkEnumerateInstanceLayerProperties( + &instance_layers_count, nullptr)); + + std::vector instance_layer_properties( + instance_layers_count); + + VK_CHECK(vkEnumerateInstanceLayerProperties( + &instance_layers_count, + instance_layer_properties.data())); + + constexpr const char* const requested_instance_layers[]{ + // "VK_LAYER_LUNARG_api_dump", + "VK_LAYER_KHRONOS_validation", + }; + + for (const auto& requested_instance_layer : requested_instance_layers) { + for (const auto& layer : instance_layer_properties) { + if (strcmp(requested_instance_layer, layer.layerName) == 0) { + enabled_instance_layers.push_back(requested_instance_layer); + break; + } + } + } + + uint32_t instance_extension_count = 0; + VK_CHECK(vkEnumerateInstanceExtensionProperties( + nullptr, &instance_extension_count, nullptr)); + + std::vector instance_extension_properties( + instance_extension_count); + + VK_CHECK(vkEnumerateInstanceExtensionProperties( + nullptr, &instance_extension_count, instance_extension_properties.data())); + + constexpr const char* const requested_instance_extensions[]{ + VK_EXT_DEBUG_REPORT_EXTENSION_NAME, + }; + + for (const auto& requested_instance_extension : requested_instance_extensions) { + for (const auto& extension : instance_extension_properties) { + if (strcmp(requested_instance_extension, extension.extensionName) == 0) { + enabled_instance_extensions.push_back(requested_instance_extension); + break; + } + } + } + } + + constexpr VkApplicationInfo application_info{ + VK_STRUCTURE_TYPE_APPLICATION_INFO, + nullptr, + "PyTorch", + 0, + "PyTorch", + 0, + VK_API_VERSION_1_0, + }; + + const VkInstanceCreateInfo instance_create_info{ + VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, + nullptr, + 0u, + &application_info, + static_cast(enabled_instance_layers.size()), + enabled_instance_layers.data(), + static_cast(enabled_instance_extensions.size()), + enabled_instance_extensions.data(), + }; + + VkInstance instance{}; + VK_CHECK(vkCreateInstance(&instance_create_info, nullptr, &instance)); + TORCH_CHECK(instance, "Invalid Vulkan instance!"); + + return instance; +} + +VkDebugReportCallbackEXT create_debug_report_callback( + const VkInstance instance, + const Runtime::Type type) { + if (Runtime::Type::Debug != type) { + return VkDebugReportCallbackEXT{}; + } + + const VkDebugReportCallbackCreateInfoEXT debugReportCallbackCreateInfo{ + VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, + nullptr, + VK_DEBUG_REPORT_INFORMATION_BIT_EXT | + VK_DEBUG_REPORT_WARNING_BIT_EXT | + VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT | + VK_DEBUG_REPORT_ERROR_BIT_EXT | + VK_DEBUG_REPORT_DEBUG_BIT_EXT, + debug_report_callback_fn, + nullptr, + }; + + const auto vkCreateDebugReportCallbackEXT = + (PFN_vkCreateDebugReportCallbackEXT)vkGetInstanceProcAddr( + instance, "vkCreateDebugReportCallbackEXT"); + + TORCH_CHECK( + vkCreateDebugReportCallbackEXT, + "Could not load vkCreateDebugReportCallbackEXT"); + + VkDebugReportCallbackEXT debug_report_callback{}; + VK_CHECK(vkCreateDebugReportCallbackEXT( + instance, + &debugReportCallbackCreateInfo, + nullptr, + &debug_report_callback)); + + TORCH_CHECK( + debug_report_callback, + "Invalid Vulkan debug report callback!"); + + return debug_report_callback; +} + +std::vector acquire_physical_devices( + const VkInstance instance) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + instance, + "Invalid Vulkan instance!"); + + uint32_t device_count = 0; + 
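// acquire_physical_devices() below, like the layer/extension queries above,
// uses the standard Vulkan enumerate-twice idiom: one call to obtain the
// count, a second to fill a correctly sized buffer. Generic sketch (VK_CHECK
// error handling omitted for brevity):

#include <vector>
#include <vulkan/vulkan.h>

std::vector<VkLayerProperties> enumerate_instance_layers() {
  uint32_t count = 0u;
  vkEnumerateInstanceLayerProperties(&count, nullptr);       // query count
  std::vector<VkLayerProperties> layers(count);
  vkEnumerateInstanceLayerProperties(&count, layers.data()); // fill data
  return layers;
}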
VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, nullptr)); + + TORCH_CHECK( + device_count > 0, + "Vulkan: Could not find a device with Vulkan support!"); + + std::vector devices(device_count); + VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, devices.data())); + + return devices; +} + +VkPhysicalDeviceProperties query_physical_device_properties( + const VkPhysicalDevice physical_device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + physical_device, + "Invalid Vulkan physical device!"); + + VkPhysicalDeviceProperties physical_device_properties{}; + vkGetPhysicalDeviceProperties( + physical_device, + &physical_device_properties); + + return physical_device_properties; +} + +VkPhysicalDeviceMemoryProperties query_physical_device_memory_properties( + const VkPhysicalDevice physical_device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + physical_device, + "Invalid Vulkan physical device!"); + + VkPhysicalDeviceMemoryProperties physical_device_memory_properties{}; + vkGetPhysicalDeviceMemoryProperties( + physical_device, + &physical_device_memory_properties); + + return physical_device_memory_properties; +} + +uint32_t query_compute_queue_family_index(const VkPhysicalDevice physical_device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + physical_device, + "Invalid Vulkan physical device!"); + + uint32_t queue_family_count = 0; + vkGetPhysicalDeviceQueueFamilyProperties( + physical_device, &queue_family_count, nullptr); + + TORCH_CHECK( + queue_family_count > 0, + "Vulkan: Invalid number of queue families!"); + + std::vector + queue_families_properties(queue_family_count); + + vkGetPhysicalDeviceQueueFamilyProperties( + physical_device, + &queue_family_count, + queue_families_properties.data()); + + for (uint32_t i = 0; i < queue_families_properties.size(); ++i) { + const VkQueueFamilyProperties& properties = queue_families_properties[i]; + if (properties.queueCount > 0 && (properties.queueFlags & VK_QUEUE_COMPUTE_BIT)) { + return i; + } + } + + TORCH_CHECK( + false, + "Vulkan: Could not find a queue family that supports compute operations!"); +} + +} // namespace + +Runtime::Debug::Debug(const VkInstance instance) + : instance_(instance) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + instance, + "Invalid Vulkan instance!"); +} + +void Runtime::Debug::operator()( + const VkDebugReportCallbackEXT debug_report_callback) const { + if (debug_report_callback) { + const auto vkDestroyDebugReportCallbackEXT = + (PFN_vkDestroyDebugReportCallbackEXT)vkGetInstanceProcAddr( + instance_, "vkDestroyDebugReportCallbackEXT"); + + TORCH_CHECK( + vkDestroyDebugReportCallbackEXT, + "Could not load vkDestroyDebugReportCallbackEXT"); + + vkDestroyDebugReportCallbackEXT( + instance_, debug_report_callback, nullptr); + } +} + +Runtime::Runtime(const Type type) + : instance_(create_instance(type), &VK_DELETER(Instance)), + debug_report_callback_( + create_debug_report_callback(instance(), type), + Debug(instance())) { +} + +Adapter Runtime::select(const Selector& selector) { + const std::vector physical_devices = + acquire_physical_devices(instance()); + + for (const VkPhysicalDevice physical_device : physical_devices) { + const Adapter adapter{ + this, + physical_device, + query_physical_device_properties(physical_device), + query_physical_device_memory_properties(physical_device), + query_compute_queue_family_index(physical_device), + }; + + if (selector(adapter)) { + return adapter; + } + } + + TORCH_CHECK( + false, + "Vulkan: no adapter was selected as part of device enumeration!"); +} + +Runtime* initialize() { + 
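// initialize(), continued below, follows the same lazy-singleton pattern as
// the old Context initializer: construct once, convert any construction
// failure into nullptr, and let available()/runtime() interpret the result.
// The pattern in isolation (generic sketch, not part of this patch):

#include <memory>

template <typename T>
T* lazy_singleton() {
  static const std::unique_ptr<T> instance([]() -> T* {
    try {
      return new T();
    } catch (...) {
      return nullptr; // construction failure means "backend not available"
    }
  }());
  return instance.get();
}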
static const std::unique_ptr runtime([]() -> Runtime* { +#ifdef USE_VULKAN_WRAPPER + if (!InitVulkan()) { + TORCH_WARN("Vulkan: Wrapper Failed to InitVulkan!"); + return nullptr; + } +#endif + + try { + return new Runtime(Configuration::kRuntime); + } + catch (...) { + return nullptr; + } + }()); + + return runtime.get(); +} + +bool available() { + return initialize(); +} + +Runtime* runtime() { + Runtime* const runtime = initialize(); + TORCH_CHECK( + runtime, + "Vulkan: Backend not available on this platform!" + "Calls to api::runtime() must have been guarded by api::available()."); + + return runtime; +} + +} // namespace api +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/vulkan/api/Runtime.h b/aten/src/ATen/native/vulkan/api/Runtime.h new file mode 100644 index 000000000000..766aeb50cabc --- /dev/null +++ b/aten/src/ATen/native/vulkan/api/Runtime.h @@ -0,0 +1,64 @@ +#pragma once + +#include + +namespace at { +namespace native { +namespace vulkan { +namespace api { + +// +// A Vulkan Runtime initializes a Vulkan instance and decouples the concept of +// Vulkan instance initialization from intialization of, and subsequent +// interactions with, Vulkan [physical and logical] devices as a precursor to +// multi-GPU support. The Vulkan Runtime can be queried for available Adapters +// (i.e. physical devices) in the system which in turn can be used for creation +// of a Vulkan Context (i.e. logical devices). All Vulkan tensors in PyTorch +// are associated with a Context to make tensor <-> device affinity explicit. +// + +class Runtime final { + public: + enum class Type { + Debug, + Release, + }; + + explicit Runtime(Type type); + Runtime(const Runtime&) = delete; + Runtime(Runtime&&) = default; + Runtime& operator=(const Runtime&) = delete; + Runtime& operator=(Runtime&&) = default; + ~Runtime() = default; + + inline VkInstance instance() const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(instance_); + return instance_.get(); + } + + typedef std::function Selector; + Adapter select(const Selector& selector); + + private: + class Debug final { + public: + explicit Debug(VkInstance instance); + void operator()(VkDebugReportCallbackEXT debug_report_callback) const; + + private: + VkInstance instance_; + }; + + private: + // Construction and destruction order matters. Do not move members around. 
+ Handle instance_; + Handle debug_report_callback_; +}; + +bool available(); +Runtime* runtime(); + +} // namespace api +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/vulkan/api/Shader.cpp b/aten/src/ATen/native/vulkan/api/Shader.cpp index bbd3e3464d78..977f915a61d1 100644 --- a/aten/src/ATen/native/vulkan/api/Shader.cpp +++ b/aten/src/ATen/native/vulkan/api/Shader.cpp @@ -9,8 +9,12 @@ namespace native { namespace vulkan { namespace api { -Shader::Layout::Factory::Factory(const VkDevice device) - : device_(device) { + +Shader::Layout::Factory::Factory(const GPU& gpu) + : device_(gpu.device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device_, + "Invalid Vulkan device!"); } Shader::Layout::Factory::Handle Shader::Layout::Factory::operator()( @@ -25,7 +29,14 @@ Shader::Layout::Factory::Handle Shader::Layout::Factory::operator()( VkDescriptorSetLayout descriptor_set_layout{}; VK_CHECK(vkCreateDescriptorSetLayout( - device_, &descriptor_set_layout_create_info, nullptr, &descriptor_set_layout)); + device_, + &descriptor_set_layout_create_info, + nullptr, + &descriptor_set_layout)); + + TORCH_CHECK( + descriptor_set_layout, + "Invalid Vulkan descriptor set layout!"); return Handle{ descriptor_set_layout, @@ -35,6 +46,8 @@ Shader::Layout::Factory::Handle Shader::Layout::Factory::operator()( Shader::Descriptor::Descriptor(const char* const glsl) : type(Type::Source) { + TORCH_CHECK(glsl, "Invalid shader source code!"); + shader.source = { glsl, 0u, @@ -43,6 +56,8 @@ Shader::Descriptor::Descriptor(const char* const glsl) Shader::Descriptor::Descriptor(const uint32_t* const code, const uint32_t size) : type(Type::Binary) { + TORCH_CHECK(code && (0u != size), "Invalid shader binary!"); + shader.binary = { code, size, @@ -68,6 +83,10 @@ struct Shader::Factory::Compiler final { } std::vector compile(const char* const source) const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + source, + "Invalid shader source code!"); + const shaderc::SpvCompilationResult result = context.CompileGlslToSpv( source, ::strlen(source), @@ -95,8 +114,8 @@ struct Shader::Factory::Compiler final { #endif /* USE_VULKAN_SHADERC_RUNTIME */ -Shader::Factory::Factory(const VkDevice device) - : device_(device), +Shader::Factory::Factory(const GPU& gpu) + : device_(gpu.device), compiler_(new Compiler) { } @@ -139,7 +158,14 @@ typename Shader::Factory::Handle Shader::Factory::operator()( VkShaderModule shader_module{}; VK_CHECK(vkCreateShaderModule( - device_, &shader_module_create_info, nullptr, &shader_module)); + device_, + &shader_module_create_info, + nullptr, + &shader_module)); + + TORCH_CHECK( + shader_module, + "Invalid Vulkan shader module!"); return Handle{ shader_module, diff --git a/aten/src/ATen/native/vulkan/api/Shader.h b/aten/src/ATen/native/vulkan/api/Shader.h index 0fd2fa01614b..ff02b2ba9064 100644 --- a/aten/src/ATen/native/vulkan/api/Shader.h +++ b/aten/src/ATen/native/vulkan/api/Shader.h @@ -32,7 +32,7 @@ namespace api { // and destruct the aforementioned Vulkan objects. 
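// The two Shader::Descriptor constructors guarded above accept either GLSL
// source (compiled at runtime when the shaderc path, USE_VULKAN_SHADERC_RUNTIME,
// is enabled) or a precompiled SPIR-V binary. A minimal source-based sketch
// (placeholder GLSL, hypothetical call site):

using at::native::vulkan::api::Shader;

const char* const kGlsl = "#version 450\nvoid main() {}\n"; // placeholder
const Shader::Descriptor descriptor(kGlsl); // Type::Source; checked non-null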
// -struct C10_EXPORT Shader final { +struct Shader final { // // Layout // @@ -52,7 +52,7 @@ struct C10_EXPORT Shader final { class Factory final { public: - explicit Factory(VkDevice device); + explicit Factory(const GPU& gpu); typedef Layout::Descriptor Descriptor; typedef VK_DELETER(DescriptorSetLayout) Deleter; @@ -75,8 +75,8 @@ struct C10_EXPORT Shader final { typedef api::Cache Cache; Cache cache; - explicit Layout(const VkDevice device) - : cache(Factory(device)) { + explicit Layout(const GPU& gpu) + : cache(Factory(gpu)) { } } layout; @@ -122,7 +122,7 @@ struct C10_EXPORT Shader final { class Factory final { public: - explicit Factory(VkDevice device); + explicit Factory(const GPU& gpu); Factory(const Factory&) = delete; Factory& operator=(const Factory&) = delete; Factory(Factory&&); @@ -152,9 +152,9 @@ struct C10_EXPORT Shader final { typedef api::Cache Cache; Cache cache; - explicit Shader(const VkDevice device) - : layout(device), - cache(Factory(device)) { + explicit Shader(const GPU& gpu) + : layout(gpu), + cache(Factory(gpu)) { } }; @@ -187,11 +187,11 @@ inline size_t Shader::Layout::Factory::Hasher::operator()( } inline bool operator==( - const Shader::WorkGroup& work_group_1, - const Shader::WorkGroup& work_group_2) { - return (work_group_1.x == work_group_2.x) && - (work_group_1.y == work_group_2.y) && - (work_group_1.z == work_group_2.z); + const Shader::WorkGroup& _1, + const Shader::WorkGroup& _2) { + return (_1.x == _2.x) && + (_1.y == _2.y) && + (_1.z == _2.z); } inline bool operator==( diff --git a/aten/src/ATen/native/vulkan/api/api.h b/aten/src/ATen/native/vulkan/api/api.h index 394f55d7d525..658824e3bf2b 100644 --- a/aten/src/ATen/native/vulkan/api/api.h +++ b/aten/src/ATen/native/vulkan/api/api.h @@ -2,9 +2,11 @@ #include +#include #include #include #include #include #include +#include #include diff --git a/aten/src/ATen/quantized/Quantizer.cpp b/aten/src/ATen/quantized/Quantizer.cpp index 1f9225b52770..5ab64d2cb803 100644 --- a/aten/src/ATen/quantized/Quantizer.cpp +++ b/aten/src/ATen/quantized/Quantizer.cpp @@ -77,6 +77,18 @@ QTensorImpl* get_qtensorimpl(const Tensor& self) { return static_cast(self.unsafeGetTensorImpl()); } +int64_t get_sub_byte_tensor_size(int64_t size_bytes, at::ScalarType t) { + int64_t new_size_bytes; + switch(t) { + case at::ScalarType::QUInt4x2: + new_size_bytes = std::ceil(size_bytes * 0.5); + break; + default: + new_size_bytes = size_bytes; + } + return new_size_bytes; +} + inline Tensor new_qtensor( IntArrayRef sizes, const TensorOptions& options, @@ -99,7 +111,9 @@ inline Tensor new_qtensor( TORCH_CHECK( isQIntType(typeMetaToScalarType(dtype)), "ScalarType is not supported in new_qtensor."); - int64_t size_bytes = nelements * dtype.itemsize(); + auto scalar_type = typeMetaToScalarType(dtype); + int64_t size_bytes = get_sub_byte_tensor_size(nelements * dtype.itemsize(), scalar_type); + auto storage = c10::make_intrusive( StorageImpl::use_byte_size_t(), size_bytes, diff --git a/aten/src/ATen/record_function.cpp b/aten/src/ATen/record_function.cpp index 8bf25c3cac2f..26e9fd9f21fa 100644 --- a/aten/src/ATen/record_function.cpp +++ b/aten/src/ATen/record_function.cpp @@ -18,9 +18,7 @@ RecordFunctionHandle next_unique_record_function_handle() { return RecordFunctionHandle(++unique_rf_id); } -// Thread local vector of callbacks, holds pairs (callbacks, unique_id); -// must be sorted in increasing handles order -thread_local RecordFunctionCallbacks sorted_tls_callbacks_; +thread_local RecordFunctionTLS rf_tls_; std::atomic 
defaultNodeId(-1); @@ -52,13 +50,21 @@ double sample_zero_one() { } // namespace +const RecordFunctionTLS& get_record_function_tls_() { + return rf_tls_; +} + +void set_record_function_tls_(const RecordFunctionTLS& tls) { + rf_tls_ = tls; +} + class CallbackManager { public: CallbackHandle addThreadLocalCallback(RecordFunctionCallback cb) { // note: monotonically increasing callbacks_unique_id keeps // sorted_tls_callbacks_ sorted auto handle = next_unique_callback_handle(); - sorted_tls_callbacks_.emplace_back(std::move(cb), handle); + rf_tls_.sorted_tls_callbacks_.emplace_back(std::move(cb), handle); return handle; } @@ -85,7 +91,7 @@ class CallbackManager { } return false; }; - auto found = find_and_remove(sorted_tls_callbacks_); + auto found = find_and_remove(rf_tls_.sorted_tls_callbacks_); if (!found) { found = find_and_remove(sorted_global_callbacks_); } @@ -99,7 +105,7 @@ class CallbackManager { } void clearThreadLocalCallbacks() { - sorted_tls_callbacks_.clear(); + rf_tls_.sorted_tls_callbacks_.clear(); } inline bool hasGlobalCallbacks() const { @@ -107,7 +113,7 @@ class CallbackManager { } inline bool hasThreadLocalCallbacks() const { - return !sorted_tls_callbacks_.empty(); + return !rf_tls_.sorted_tls_callbacks_.empty(); } // init is called by RecordFunction in constructor to @@ -141,7 +147,7 @@ class CallbackManager { ctx_list.resize(num_callbacks); }; - init_handles(rec_fn.sorted_active_tls_handles_, sorted_tls_callbacks_, rec_fn.tls_ctx_); + init_handles(rec_fn.sorted_active_tls_handles_, rf_tls_.sorted_tls_callbacks_, rec_fn.tls_ctx_); init_handles(rec_fn.sorted_active_global_handles_, sorted_global_callbacks_, rec_fn.global_ctx_); rec_fn.active = found_active_cb; rec_fn.needs_inputs = found_needs_inputs; @@ -158,7 +164,7 @@ class CallbackManager { /* is_start */ true, rf); mergeRunCallbacks( - sorted_tls_callbacks_, + rf_tls_.sorted_tls_callbacks_, rf.sorted_active_tls_handles_, rf.tls_ctx_, /* is_start */ true, @@ -174,13 +180,16 @@ class CallbackManager { /* is_start */ false, rf); mergeRunCallbacks( - sorted_tls_callbacks_, + rf_tls_.sorted_tls_callbacks_, rf.sorted_active_tls_handles_, rf.tls_ctx_, /* is_start */ false, rf); } + // Global callbacks; must be sorted in increasing handle order + RecordFunctionCallbacks sorted_global_callbacks_; + private: bool tryRunCallback( const RecordFunctionCallback& rfcb, @@ -235,9 +244,6 @@ class CallbackManager { << "the code after profiler is finished"; } } - - // Global callbacks; must be sorted in increasing handle order - RecordFunctionCallbacks sorted_global_callbacks_; }; namespace { @@ -281,15 +287,15 @@ bool RecordFunctionCallback::shouldRun(RecordScope scope) const { } RecordFunctionCallbacks _getTLSCallbacks() { - return sorted_tls_callbacks_; + return rf_tls_.sorted_tls_callbacks_; } void _setTLSCallbacks(const RecordFunctionCallbacks& callbacks) { // keep the original handles - sorted_tls_callbacks_ = callbacks; + rf_tls_.sorted_tls_callbacks_ = callbacks; std::sort( - sorted_tls_callbacks_.begin(), - sorted_tls_callbacks_.end(), + rf_tls_.sorted_tls_callbacks_.begin(), + rf_tls_.sorted_tls_callbacks_.end(), [](const std::pair& l, const std::pair& r) { return l.second < r.second; @@ -338,16 +344,19 @@ void clearCallbacks() { } bool isRecordFunctionEnabled() { - return tls_record_function_enabled_; + return rf_tls_.tls_record_function_enabled_; } void enableRecordFunction(bool enable) { - tls_record_function_enabled_ = enable; + rf_tls_.tls_record_function_enabled_ = enable; } RecordFunction::RecordFunction(RecordScope 
scope) : scope_(scope) { - if (hasCallbacks() && isRecordFunctionEnabled()) { - manager().init(*this); + auto* rf_tls_ptr = &rf_tls_; + auto& m = manager(); + if (rf_tls_ptr->tls_record_function_enabled_ && + (!m.sorted_global_callbacks_.empty() || !rf_tls_ptr->sorted_tls_callbacks_.empty())) { + m.init(*this); } } diff --git a/aten/src/ATen/record_function.h b/aten/src/ATen/record_function.h index 9b4d11ef1d5f..cf839ad4a188 100644 --- a/aten/src/ATen/record_function.h +++ b/aten/src/ATen/record_function.h @@ -13,6 +13,8 @@ namespace at { enum class C10_API_ENUM RecordScope : uint8_t { // c10/ATen ops, autograd nodes FUNCTION = 0, + // Functions/nodes called from the autograd + BACKWARD_FUNCTION, // TorchScript functions, methods TORCHSCRIPT_FUNCTION, // User defined scope (e.g. with record_function()) @@ -115,10 +117,22 @@ struct TORCH_API RecordFunction { // Retrieves the thread_id that this RecordFunction ran start callbacks with. // Useful for writing thread safe end callbacks that may be potentially // executed in a different thread (async ops) - inline uint64_t getStartCallbacksThreadId() const { + inline uint64_t threadId() const { return thread_id_; } + // For backward functions - thread id of the corresponding forward function, + // or zero otherwise; + // used along with sequence number to correlate backward functions with + // the forward ones + inline uint64_t forwardThreadId() const { + return fwd_thread_id_; + } + + inline void setForwardThreadId(uint64_t thread_id) { + fwd_thread_id_ = thread_id; + } + inline RecordScope scope() const { return scope_; } @@ -205,6 +219,9 @@ struct TORCH_API RecordFunction { // The logical thread_id that this RecordFunction was created with uint64_t thread_id_ = 0; + // For backward functions - thread id of the forward function + uint64_t fwd_thread_id_ = 0; + // Unique id for this RecordFunction, used in callbacks to track start // and end of ranges RecordFunctionHandle handle_ {0}; @@ -471,4 +488,16 @@ class TORCH_API DisableRecordFunctionGuard : public RecordFunctionGuard { TORCH_API RecordFunctionCallbacks _getTLSCallbacks(); TORCH_API void _setTLSCallbacks(const RecordFunctionCallbacks& callbacks); +struct TORCH_API RecordFunctionTLS { + // Thread local vector of callbacks, holds pairs (callbacks, unique_id); + // must be sorted in increasing handles order + RecordFunctionCallbacks sorted_tls_callbacks_; + + bool tls_record_function_enabled_ = true; +}; + +TORCH_API const RecordFunctionTLS& get_record_function_tls_(); + +TORCH_API void set_record_function_tls_(const RecordFunctionTLS& tls); + } // namespace at diff --git a/aten/src/ATen/templates/BackendSelectRegister.cpp b/aten/src/ATen/templates/BackendSelectRegister.cpp index db7276913201..bcbf25f3117f 100644 --- a/aten/src/ATen/templates/BackendSelectRegister.cpp +++ b/aten/src/ATen/templates/BackendSelectRegister.cpp @@ -7,7 +7,6 @@ #include #include #include -#include #include namespace at { diff --git a/aten/src/ATen/templates/PerOpRegistration.cpp b/aten/src/ATen/templates/PerOpRegistration.cpp deleted file mode 100644 index 72ac3d784dad..000000000000 --- a/aten/src/ATen/templates/PerOpRegistration.cpp +++ /dev/null @@ -1,15 +0,0 @@ -// ${generated_comment} - -#include -#include -#include -#include -$extra_headers - -namespace at { - -TORCH_LIBRARY_FRAGMENT_THIS_API_IS_FOR_PER_OP_REGISTRATION_ONLY(aten, m) { - ${function_registrations} -} - -} // namespace at diff --git a/aten/src/ATen/templates/RegistrationDeclarations.h
b/aten/src/ATen/templates/RegistrationDeclarations.h new file mode 100644 index 000000000000..5a0f0d0c7b44 --- /dev/null +++ b/aten/src/ATen/templates/RegistrationDeclarations.h @@ -0,0 +1,4 @@ +// This file contains all native_functions that can be registered to +// and the schema string that they should be registered with + +${registration_declarations} diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp index aa5bb4f0c838..58c80381d340 100644 --- a/aten/src/ATen/templates/TypeDefault.cpp +++ b/aten/src/ATen/templates/TypeDefault.cpp @@ -27,50 +27,45 @@ TORCH_LIBRARY(aten, m) { ${function_registrations}; // String Ops - // Implementations located in torch/csrc/jit/runtime/register_string_ops.cpp - m.def("splitlines(str self, bool keepends=False) -> str[]"); - m.def( - "slice.str(str string, int start, int end=9223372036854775807, int step=1) -> str"); - m.def("isupper(str self) -> bool"); - m.def("islower(str self) -> bool"); - m.def("capitalize(str self) -> str"); - m.def("title(str self) -> str"); - m.def("center(str self, int width, str fillchar=' ') -> str"); - m.def("count(str self, str substr, int start=0, int end=-1) -> int"); - m.def("endswith(str self, str substr, int start=0, int end=-1) -> bool"); - m.def("startswith(str self, str substr, int start=0, int end=-1) -> bool"); - m.def("expandtabs(str self, int tabsize=8) -> str"); - m.def("find(str self, str substr, int start=0, int end=-1) -> int"); - m.def("rfind(str self, str substr, int start=0, int end=-1) -> int"); - m.def("index.str(str self, str substr, int start=0, int end=-1) -> int"); - m.def("rindex(str self, str substr, int start=0, int end=-1) -> int"); - m.def("isidentifier(str self) -> bool"); - m.def("istitle(str self) -> bool"); - m.def("isprintable(str self) -> bool"); - m.def("ljust(str self, int width, str fillchar=' ') -> str"); - m.def("rjust(str self, int width, str fillchar=' ') -> str"); - m.def("zfill(str self, int width) -> str"); - m.def("lstrip(str self, str chars=' \\n\\t\\f\\v') -> str"); - m.def("rstrip(str self, str chars=' \\n\\t\\f\\v') -> str"); - m.def("strip(str self, str chars=' \\n\\t\\f\\v') -> str"); - m.def("replace(str self, str old, str new, int max=-1) -> str"); - m.def("partition(str self, str separator) -> (str, str, str)"); - m.def("rpartition(str self, str separator) -> (str, str, str)"); - m.def("split.str(str self, str? 
separator=None, int max=-1) -> str[]"); - m.def("rsplit(str self, str separator=' ', int max=-1) -> str[]"); - m.def("join(str self, str[] values) -> str"); - - // Integer Ops - // Implementations located in torch/csrc/jit/runtime/register_prim_ops_c10.cp - m.def("Int.Tensor(Tensor a) -> int"); - m.def("Int.bool(bool a) -> int"); - m.def("Int.float(float a) -> int"); - m.def("Int.Scalar(Scalar a) -> int"); - m.def("Int.str(str a) -> int"); + // Implementations located in torch/csrc/jit/runtime/register_prim_ops.cpp + m.def(TORCH_SELECTIVE_SCHEMA("aten::splitlines(str self, bool keepends=False) -> str[]")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::slice.str(str string, int start, int end=9223372036854775807, int step=1) -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::isupper(str self) -> bool")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::islower(str self) -> bool")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::capitalize(str self) -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::title(str self) -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::center(str self, int width, str fillchar=' ') -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::count(str self, str substr, int start=0, int end=-1) -> int")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::endswith(str self, str substr, int start=0, int end=-1) -> bool")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::startswith(str self, str substr, int start=0, int end=-1) -> bool")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::expandtabs(str self, int tabsize=8) -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::find(str self, str substr, int start=0, int end=-1) -> int")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::rfind(str self, str substr, int start=0, int end=-1) -> int")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::index.str(str self, str substr, int start=0, int end=-1) -> int")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::rindex(str self, str substr, int start=0, int end=-1) -> int")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::isidentifier(str self) -> bool")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::istitle(str self) -> bool")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::isprintable(str self) -> bool")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::ljust(str self, int width, str fillchar=' ') -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::rjust(str self, int width, str fillchar=' ') -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::zfill(str self, int width) -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::lstrip(str self, str chars=' \\n\\t\\f\\v') -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::rstrip(str self, str chars=' \\n\\t\\f\\v') -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::strip(str self, str chars=' \\n\\t\\f\\v') -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::replace(str self, str old, str new, int max=-1) -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::partition(str self, str separator) -> (str, str, str)")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::rpartition(str self, str separator) -> (str, str, str)")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::split.str(str self, str? 
separator=None, int max=-1) -> str[]")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::rsplit(str self, str separator=' ', int max=-1) -> str[]")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::join(str self, str[] values) -> str")); // Distributed Ops // Implementations located in torch/csrc/jit/runtime/register_distributed_ops.cpp m.def("get_gradients(int context_id) -> Dict(Tensor, Tensor)"); } +TORCH_LIBRARY_IMPL(aten, Math, m) { + ${math_function_registrations}; +} + } // namespace at diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt index a0b992302084..9f69c9d6ad6f 100644 --- a/aten/src/ATen/test/CMakeLists.txt +++ b/aten/src/ATen/test/CMakeLists.txt @@ -27,6 +27,7 @@ list(APPEND ATen_CPU_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/extension_backend_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/xla_tensor_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tensor_iterator_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/math_kernel_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/memory_overlapping_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpu_generator_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/pow_test.cpp @@ -78,11 +79,13 @@ list(APPEND ATen_VULKAN_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/vulkan_test.cpp) list(APPEND ATen_MOBILE_TEST_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpu_profiling_allocator_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpu_caching_allocator_test.cpp) list(APPEND ATen_VEC256_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test_all_types.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test_all_types.cpp ) # Caffe2 specific tests diff --git a/aten/src/ATen/test/cpu_caching_allocator_test.cpp b/aten/src/ATen/test/cpu_caching_allocator_test.cpp index 28a9b0476524..cead52f5a7cc 100644 --- a/aten/src/ATen/test/cpu_caching_allocator_test.cpp +++ b/aten/src/ATen/test/cpu_caching_allocator_test.cpp @@ -3,7 +3,7 @@ #include #include -#include +#include TEST(CPUCachingAllocatorTest, check_alloc_free) { c10::CPUCachingAllocator caching_allocator; diff --git a/aten/src/ATen/test/cpu_profiling_allocator_test.cpp b/aten/src/ATen/test/cpu_profiling_allocator_test.cpp new file mode 100644 index 000000000000..d3391425e14b --- /dev/null +++ b/aten/src/ATen/test/cpu_profiling_allocator_test.cpp @@ -0,0 +1,167 @@ +#include + +#include +#include + +at::Tensor run_with_control_flow( + at::Tensor input, + at::Tensor conv_weight, + at::Tensor linear_weight, + bool cond, + std::vector& pointers, + bool record = false, + bool validate = false) { + if (cond) { + input = input * 2; + } + void* input_ptr = input.data_ptr(); + auto conv_out = at::conv2d(input, conv_weight); + void* conv_out_ptr = input.data_ptr(); + auto conv_out_flat = conv_out.view({conv_out.size(0), -1}); + auto output = at::linear(conv_out_flat, linear_weight); + if (record) { + pointers.push_back(input_ptr); + pointers.push_back(conv_out_ptr); + } + if (validate) { + TORCH_CHECK(input_ptr == pointers[0]); + TORCH_CHECK(conv_out_ptr == pointers[1]); + } + return output; +} + +TEST(CPUAllocationPlanTest, with_control_flow) { + at::Tensor a = at::rand({23, 16, 16, 16}); + at::Tensor conv_weight = at::rand({16, 16, 3, 3}); + // output shape + // 23, 16, 14, 14 + // Flattened shape = 23, 3136 + at::Tensor linear_weight = at::rand({32, 3136}); + at::Tensor output; + std::vector pointers; + + auto valid_allocation_plan = [&]() { + c10::AllocationPlan plan; + { + c10::WithProfileAllocationsGuard profile_guard(&plan); + output = run_with_control_flow( + a, conv_weight, linear_weight, true, pointers); + } + 
}; + ASSERT_NO_THROW(valid_allocation_plan()); + + auto validate_allocation_plan = + [&](bool record_mode, bool validation_mode) -> bool { + c10::AllocationPlan plan; + { + c10::WithProfileAllocationsGuard profile_guard(&plan); + output = + run_with_control_flow(a, conv_weight, linear_weight, record_mode, pointers); + } + bool success{true}; + for (uint64_t i = 0; i < 10; ++i) { + bool validation_success; + { + c10::WithValidateAllocationPlanGuard + validation_guard(&plan, &validation_success); + output = run_with_control_flow( + a, conv_weight, linear_weight, validation_mode, pointers); + } + success = success && validation_success; + } + return success; + }; + ASSERT_FALSE(validate_allocation_plan(false, true)); + ASSERT_FALSE(validate_allocation_plan(true, false)); + ASSERT_TRUE(validate_allocation_plan(true, true)); + ASSERT_TRUE(validate_allocation_plan(false, false)); +} + +TEST(CPUAllocationPlanTest, with_profiling_alloc) { + at::Tensor a = at::rand({23, 16, 16, 16}); + at::Tensor conv_weight = at::rand({16, 16, 3, 3}); + // output shape + // 23, 16, 14, 14 + // Flattened shape = 23, 3136 + at::Tensor linear_weight = at::rand({32, 3136}); + at::Tensor output; + std::vector pointers; + + auto valid_allocation_plan = [&]() { + c10::AllocationPlan plan; + { + c10::WithProfileAllocationsGuard profile_guard(&plan); + output = run_with_control_flow( + a, conv_weight, linear_weight, false, pointers); + } + }; + ASSERT_NO_THROW(valid_allocation_plan()); + + auto validate_allocation_plan = + [&](bool record_mode, + bool validation_mode, + bool validate_pointers) { + pointers.clear(); + c10::AllocationPlan plan; + { + c10::WithProfileAllocationsGuard profile_guard(&plan); + output = run_with_control_flow( + a, + conv_weight, + linear_weight, + record_mode, + pointers, + false, + false); + } + c10::CPUProfilingAllocator profiling_allocator; + { + c10::WithProfilingAllocatorGuard + profiling_allocator_guard(&profiling_allocator, &plan); + output = run_with_control_flow( + a, + conv_weight, + linear_weight, + validation_mode, + pointers, + validate_pointers, + false); + } + for (uint64_t i = 0; i < 10; ++i) { + { + c10::WithProfilingAllocatorGuard + profiling_allocator_guard(&profiling_allocator, &plan); + output = run_with_control_flow( + a, + conv_weight, + linear_weight, + validation_mode, + pointers, + false, + validate_pointers); + } + } + }; + // When control flow conditions are same between profiling and evaluation + // profiling allocator should not throw. + ASSERT_NO_THROW(validate_allocation_plan(true, true, false)); + ASSERT_NO_THROW(validate_allocation_plan(false, false, false)); + // Furthermore profiling allocator should return the same pointers + // back for the intermediate tensors + ASSERT_NO_THROW(validate_allocation_plan(true, true, true)); + ASSERT_NO_THROW(validate_allocation_plan(false, false, true)); + + // When control flow conditions are different between profiling and evaluation + // profiling allocator should throw. + ASSERT_THROW(validate_allocation_plan(true, false, false), c10::Error); + ASSERT_THROW(validate_allocation_plan(false, true, false), c10::Error); +} + +int main(int argc, char* argv[]) { +// At the moment caching allocator is only exposed to mobile cpu allocator. 
+#ifdef C10_MOBILE + ::testing::InitGoogleTest(&argc, argv); + at::manual_seed(42); + return RUN_ALL_TESTS(); +#endif /* C10_Mobile */ +} diff --git a/aten/src/ATen/test/math_kernel_test.cpp b/aten/src/ATen/test/math_kernel_test.cpp new file mode 100644 index 000000000000..9a4dfd640c3e --- /dev/null +++ b/aten/src/ATen/test/math_kernel_test.cpp @@ -0,0 +1,40 @@ +#include + +#include + +using namespace at; + +#define ASSERT_ALLCLOSE_TOLERANCES(t1, t2, atol, rtol) \ + ASSERT_TRUE(t1.is_same_size(t2)); \ + ASSERT_TRUE(t1.allclose(t2, atol, rtol)); + +// Ideally we want to test both forward and backward on math kernels but I +// haven't found an easy way to do it. Currently we only test forward here +// and rely on backward tests of each at:: function used in math kernels. +TEST(MathKernelTest, NativeGroupNorm) { + int num_channels = 6; + int N = 2; + int H = 2, W = 2; + int HxW = H * W; + + const auto input = randn({N, num_channels, H, W}); + const auto weight = randn({num_channels}); + const auto bias = randn({num_channels}); + double eps = 1e-05; + for (bool undef_weight: {true, false}) { + for (int num_groups: {3, 6, 1}) { + Tensor undef; + auto out = at::native::native_group_norm( + input, undef_weight ? undef : weight, undef_weight ? undef : bias, + N, num_channels, HxW, num_groups, eps); + auto math_out = at::native::math_group_norm( + input, undef_weight ? undef : weight, undef_weight ? undef : bias, + N, num_channels, HxW, num_groups, eps); + ASSERT_ALLCLOSE_TOLERANCES(std::get<0>(out), std::get<0>(math_out), 1e-4, 1e-6); + ASSERT_ALLCLOSE_TOLERANCES(std::get<1>(out), std::get<1>(math_out), 1e-4, 1e-6); + ASSERT_ALLCLOSE_TOLERANCES(std::get<2>(out), std::get<2>(math_out), 1e-4, 1e-6); + } + } +} + + diff --git a/aten/src/ATen/test/thread_init_test.cpp b/aten/src/ATen/test/thread_init_test.cpp index 0650e9a3e6b4..55df55f3b58c 100644 --- a/aten/src/ATen/test/thread_init_test.cpp +++ b/aten/src/ATen/test/thread_init_test.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp index 28c1827485b7..ebf9ffce99d0 100644 --- a/aten/src/ATen/test/vulkan_api_test.cpp +++ b/aten/src/ATen/test/vulkan_api_test.cpp @@ -6,11 +6,6 @@ namespace { -TEST(VulkanAPITest, Context) { - constexpr bool kDebug = true; - ASSERT_NO_THROW(at::native::vulkan::api::Context{kDebug}); -} - } // namespace #endif /* USE_VULKAN_API */ diff --git a/aten/src/TH/CMakeLists.txt b/aten/src/TH/CMakeLists.txt index 6a491991a090..a3ed10126b93 100644 --- a/aten/src/TH/CMakeLists.txt +++ b/aten/src/TH/CMakeLists.txt @@ -65,6 +65,7 @@ install(FILES THGenerateComplexTypes.h THGenerateIntTypes.h THGenerateQUInt8Type.h + THGenerateQUInt4x2Type.h THGenerateQInt8Type.h THGenerateQInt32Type.h THGenerateQTypes.h diff --git a/aten/src/TH/THGenerateQTypes.h b/aten/src/TH/THGenerateQTypes.h index ee958b3a3210..611b990f508f 100644 --- a/aten/src/TH/THGenerateQTypes.h +++ b/aten/src/TH/THGenerateQTypes.h @@ -10,6 +10,7 @@ #include #include #include +#include #ifdef THQLocalGenerateManyTypes #undef THQLocalGenerateManyTypes diff --git a/aten/src/TH/THGenerateQUInt4x2Type.h b/aten/src/TH/THGenerateQUInt4x2Type.h new file mode 100644 index 000000000000..4ecea4514359 --- /dev/null +++ b/aten/src/TH/THGenerateQUInt4x2Type.h @@ -0,0 +1,24 @@ +#ifndef TH_GENERIC_FILE +#error "You must define TH_GENERIC_FILE before including THGenerateQUInt4x2Type.h" +#endif + +#define quantized_t c10::quint4x2 +#define scalar_t uint8_t +#define Real QUInt4x2 
+#define RealUnderlying Byte +#define THQUANTIZED +#define THQUINT8 +#define TH_REAL_IS_BYTE +#line 1 TH_GENERIC_FILE +#include TH_GENERIC_FILE +#undef scalar_t +#undef quantized_t +#undef Real +#undef RealUnderlying +#undef TH_REAL_IS_BYTE +#undef THQUINT8 +#undef THQUANTIZED + +#ifndef THGenerateManyTypes +#undef TH_GENERIC_FILE +#endif diff --git a/aten/src/TH/generic/THBlas.cpp b/aten/src/TH/generic/THBlas.cpp index fd9fe5e6c233..64bc8106fbb3 100644 --- a/aten/src/TH/generic/THBlas.cpp +++ b/aten/src/TH/generic/THBlas.cpp @@ -14,8 +14,6 @@ TH_EXTERNC void dcopy_(int *n, double *x, int *incx, double *y, int *incy); TH_EXTERNC void scopy_(int *n, float *x, int *incx, float *y, int *incy); TH_EXTERNC void daxpy_(int *n, double *a, double *x, int *incx, double *y, int *incy); TH_EXTERNC void saxpy_(int *n, float *a, float *x, int *incx, float *y, int *incy); -TH_EXTERNC void dger_(int *m, int *n, double *alpha, double *x, int *incx, double *y, int *incy, double *a, int *lda); -TH_EXTERNC void sger_(int *m, int *n, float *alpha, float *x, int *incx, float *y, int *incy, float *a, int *lda); void THBlas_(swap)(int64_t n, scalar_t *x, int64_t incx, scalar_t *y, int64_t incy) { @@ -111,51 +109,4 @@ void THBlas_(axpy)(int64_t n, scalar_t a, scalar_t *x, int64_t incx, scalar_t *y } } -void THBlas_(ger)( - int64_t m, - int64_t n, - scalar_t alpha, - scalar_t *x, - int64_t incx, - scalar_t *y, - int64_t incy, - scalar_t *a, - int64_t lda) -{ - if(n == 1) - lda = m; - -#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)) - if( (m <= INT_MAX) && (n <= INT_MAX) && (lda <= INT_MAX) && - (incx > 0) && (incx <= INT_MAX) && - (incy > 0) && (incy <= INT_MAX) ) - { - THArgCheck(lda >= THMax(1, m), 9, - "lda should be at least max(1, m=%d), but have %d", m, lda); - int i_m = (int)m; - int i_n = (int)n; - int i_lda = (int)lda; - int i_incx = (int)incx; - int i_incy = (int)incy; - -#if defined(TH_REAL_IS_DOUBLE) - dger_(&i_m, &i_n, &alpha, x, &i_incx, y, &i_incy, a, &i_lda); -#else - sger_(&i_m, &i_n, &alpha, x, &i_incx, y, &i_incy, a, &i_lda); -#endif - return; - } -#endif - { - int64_t i, j; - for(j = 0; j < n; j++) - { - scalar_t *column_ = a+j*lda; - scalar_t z = alpha*y[j*incy]; - for(i = 0; i < m; i++) - column_[i] += z*x[i*incx] ; - } - } -} - #endif diff --git a/aten/src/TH/generic/THBlas.h b/aten/src/TH/generic/THBlas.h index 4d3facea4d06..a70d99969d31 100644 --- a/aten/src/TH/generic/THBlas.h +++ b/aten/src/TH/generic/THBlas.h @@ -7,7 +7,4 @@ TH_API void THBlas_(swap)(int64_t n, scalar_t *x, int64_t incx, scalar_t *y, int TH_API void THBlas_(copy)(int64_t n, scalar_t *x, int64_t incx, scalar_t *y, int64_t incy); TH_API void THBlas_(axpy)(int64_t n, scalar_t a, scalar_t *x, int64_t incx, scalar_t *y, int64_t incy); -/* Level 2 */ -TH_API void THBlas_(ger)(int64_t m, int64_t n, scalar_t alpha, scalar_t *x, int64_t incx, scalar_t *y, int64_t incy, scalar_t *a, int64_t lda); - #endif diff --git a/aten/src/TH/generic/THStorage.h b/aten/src/TH/generic/THStorage.h index cd419c695ba5..a41991c469c7 100644 --- a/aten/src/TH/generic/THStorage.h +++ b/aten/src/TH/generic/THStorage.h @@ -38,6 +38,7 @@ #define THQUInt8Storage THStorage #define THQInt8Storage THStorage #define THQInt32Storage THStorage +#define THQUInt4x2Storage THStorage #define THComplexFloatStorage THStorage #define THComplexDoubleStorage THStorage diff --git a/aten/src/TH/generic/THTensorEvenMoreMath.cpp b/aten/src/TH/generic/THTensorEvenMoreMath.cpp index 764220c24673..6a79f3e14c14 100644 --- 
a/aten/src/TH/generic/THTensorEvenMoreMath.cpp +++ b/aten/src/TH/generic/THTensorEvenMoreMath.cpp @@ -216,50 +216,6 @@ static inline int64_t THTensor_(wrapLinearIndex)(int64_t linearIndex, int64_t nu return linearIndex < 0 ? linearIndex + numel : linearIndex; } -void THTensor_(take)(THTensor *r_, THTensor *src, THLongTensor *index) -{ - THTensor_(resizeNd)(r_, index->dim(), THTensor_getSizePtr(index), NULL); - THTensor* dst = THTensor_(newContiguous)(r_); - - index = THLongTensor_newContiguous(index); - int64_t* index_data = THLongTensor_data(index); - ptrdiff_t srcElements = THTensor_(nElement)(src); - scalar_t* src_data = src->data(); - scalar_t* dst_data = dst->data(); - ptrdiff_t nIndices = THLongTensor_nElement(index); - int isContiguous = THTensor_(isContiguous)(src); - - // Exceptions must not be thrown across parallel sections, so we - // record the position of the invalid index and throw the exception after the - // loop. - std::atomic invalidIdxPos(-1); - - at::parallel_for(0, nIndices, TH_OMP_OVERHEAD_THRESHOLD, - [&](int64_t start, int64_t end) { - for (auto i = start; i < end; i++) { - int64_t idx = index_data[i]; - if (idx < srcElements && idx >= -srcElements) { - idx = THTensor_(wrapLinearIndex)(idx, srcElements); - if (isContiguous) { - dst_data[i] = src_data[idx]; - } else { - dst_data[i] = src_data[THTensor_(dataOffset)(src, idx)]; - } - } else { - int64_t tmp = -1; - invalidIdxPos.compare_exchange_strong(tmp, i); - } - } - }); - - if (invalidIdxPos >= 0) { - THTensor_(checkLinearIndex)(index_data[invalidIdxPos], srcElements); - } - - THLongTensor_free(index); - THTensor_(freeCopyTo)(dst, r_); -} - void THTensor_(put)(THTensor *tensor, THLongTensor *index, THTensor *src, int accumulate) { THArgCheck(THLongTensor_nElement(index) == THTensor_(nElement)(src), 3, diff --git a/aten/src/TH/generic/THTensorMath.cpp b/aten/src/TH/generic/THTensorMath.cpp index 2450d58a7b57..eb3b593ac736 100644 --- a/aten/src/TH/generic/THTensorMath.cpp +++ b/aten/src/TH/generic/THTensorMath.cpp @@ -22,76 +22,4 @@ // sense (rather than just having cut the file down the middle, which is // what I did when I split these up originally). 
- -#if !defined(TH_REAL_IS_BOOL) /* non bool only part */ - -void THTensor_(addr)(THTensor *r_, THTensor *t, THTensor *vec1, THTensor *vec2, scalar_t beta, scalar_t alpha) -{ - if( (THTensor_nDimension(vec1) != 1) || (THTensor_nDimension(vec2) != 1) ) - THError("vector and vector expected, got %dD, %dD tensors", - THTensor_nDimension(vec1), THTensor_nDimension(vec2)); - - if(t->dim() != 2) - THError("expected matrix, got %dD tensor for t", t->dim()); - - auto vec1_size = THTensor_(size)(vec1, 0); - auto vec2_size = THTensor_(size)(vec2, 0); - auto vec1_stride = THTensor_(stride)(vec1, 0); - auto vec2_stride = THTensor_(stride)(vec2, 0); - - if( (t->size(0) != vec1_size) || (t->size(1) != vec2_size) ) { - THDescBuff bt = THTensor_(sizeDesc)(t); - THDescBuff bv1 = THTensor_(sizeDesc)(vec1); - THDescBuff bv2 = THTensor_(sizeDesc)(vec2); - THError("size mismatch, t: %s, vec1: %s, vec2: %s", bt.str, bv1.str, bv2.str); - } - - if(r_ != t) - { - THTensor_(resizeAs)(r_, t); - at::Tensor r__wrap = THTensor_wrap(r_); - at::Tensor t_wrap = THTensor_wrap(t); - at::native::copy_(r__wrap, t_wrap); - } - - if(beta == 0) { - THTensor_wrap(r_).zero_(); - } - else if(beta != 1) - THTensor_(mul)(r_, r_, beta); - - // n == 1 || lda >= max(1, m) - #define LDA_COND(M, N, LDA) ((N) == 1 || (LDA) >= THMax(1, (M))) - - if(r_->stride(0) == 1 && LDA_COND(vec1_size, vec2_size, r_->stride(1))) - { - THBlas_(ger)(vec1_size, vec2_size, - alpha, vec1->data(), vec1_stride, - vec2->data(), vec2_stride, - r_->data(), r_->stride(1)); - } - else if(r_->stride(1) == 1 && LDA_COND(vec2_size, vec1_size, r_->stride(0))) - { - THBlas_(ger)(vec2_size, vec1_size, - alpha, vec2->data(), vec2_stride, - vec1->data(), vec1_stride, - r_->data(), r_->stride(0)); - } - else - { - THTensor *cr = THTensor_(newClone)(r_); - - THBlas_(ger)(vec2_size, vec1_size, - alpha, vec2->data(), vec2_stride, - vec1->data(), vec1_stride, - cr->data(), cr->stride(0)); - - THTensor_(freeCopyTo)(cr, r_); - } - - #undef LDA_COND -} - -#endif /* !defined(TH_REAL_IS_BOOL) */ - #endif /* TH_GENERIC_FILE */ diff --git a/aten/src/TH/generic/THTensorMath.h b/aten/src/TH/generic/THTensorMath.h index 18ccaeb6eb80..1d0daf1206de 100644 --- a/aten/src/TH/generic/THTensorMath.h +++ b/aten/src/TH/generic/THTensorMath.h @@ -14,8 +14,6 @@ TH_API void THTensor_(maskedCopyBool)(THTensor *tensor, THBoolTensor *mask, THTe TH_API ptrdiff_t THTensor_(numel)(THTensor *t); -TH_API void THTensor_(addr)(THTensor *r_, THTensor *t, THTensor *vec1, THTensor *vec2, scalar_t beta, scalar_t alpha); - #if !defined(TH_REAL_IS_BOOL) TH_API void THTensor_(mul)(THTensor *r_, THTensor *t, scalar_t value); #endif diff --git a/aten/src/THC/THCBlas.cu b/aten/src/THC/THCBlas.cu index fe906ce66fa3..859d904a582b 100644 --- a/aten/src/THC/THCBlas.cu +++ b/aten/src/THC/THCBlas.cu @@ -107,30 +107,9 @@ void adjustLdLevel3(char transa, char transb, int64_t m, int64_t n, int64_t k, i } -// Check https://github.com/pytorch/pytorch/issues/22078 -// for information about the bug. We don't know the exact conditions that trigger it, -// but using Sgemm or Hgemm on Maxwell or Pascal seems to be a -// necessary condition. 
-static void checkCuda90Bug(int i_m, int i_n, int i_k) -{ -#if CUDA_VERSION < 9200 && CUDA_VERSION >= 9000 - static std::once_flag alreadyWarned; - const int LIMIT = 1 << 21; - if (i_m > LIMIT || i_n > LIMIT || i_k > LIMIT) { - cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); - if (prop->major == 5 || prop->major == 6) { - std::call_once(alreadyWarned, []() { - TORCH_WARN("Matrix multiplication for dimensions larger than 2^21 has known bugs on your combination of CUDA version and device type. Please consider upgrading to CUDA 9.2 or later."); - }); - } - } -#endif -} - /* Level 3 */ void THCudaBlas_Sgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, float *a, int64_t lda, float *b, int64_t ldb, float beta, float *c, int64_t ldc) { - checkCuda90Bug((int)m, (int)n, (int)k); at::cuda::blas::gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } @@ -141,23 +120,19 @@ void THCudaBlas_Sgemm(THCState *state, char transa, char transb, int64_t m, int6 void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, at::Half alpha, at::Half *a, int64_t lda, at::Half *b, int64_t ldb, at::Half beta, at::Half *c, int64_t ldc) { - checkCuda90Bug((int)m, (int)n, (int)k); at::cuda::blas::gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -#ifdef __HIP_PLATFORM_HCC__ void THCudaBlas_Bgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, at::BFloat16 alpha, at::BFloat16 *a, int64_t lda, at::BFloat16 *b, int64_t ldb, at::BFloat16 beta, at::BFloat16 *c, int64_t ldc) { at::cuda::blas::gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -#endif void THCudaBlas_Dgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, double *a, int64_t lda, double *b, int64_t ldb, double beta, double *c, int64_t ldc) { at::cuda::blas::gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -#if CUDA_VERSION >= 9010 || defined __HIP_PLATFORM_HCC__ void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, at::Half alpha, const at::Half *a, int64_t lda, int64_t strideA, const at::Half *b, int64_t ldb, int64_t strideB, at::Half beta, at::Half *c, int64_t ldc, int64_t strideC, int64_t batchCount) @@ -205,13 +180,12 @@ void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, i #endif // CUDA_VERSION < 11000 #endif // __HIP_PLATFORM_HCC__ } -#endif // CUDA_VERSION or __HIP_PLATFORM_HCC__ -#ifdef __HIP_PLATFORM_HCC__ void THCudaBlas_BgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, at::BFloat16 alpha, const at::BFloat16 *a, int64_t lda, int64_t strideA, const at::BFloat16 *b, int64_t ldb, int64_t strideB, at::BFloat16 beta, at::BFloat16 *c, int64_t ldc, int64_t strideC, int64_t batchCount) { + at::globalContext().alertCuBLASConfigNotDeterministic(); if( (m >= INT_MAX) || (n >= INT_MAX) || (k >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (ldc >= INT_MAX) || (batchCount >= INT_MAX) ) { @@ -219,6 +193,7 @@ void THCudaBlas_BgemmStridedBatched(THCState *state, char transa, char transb, i "with the bound [val] <= %d", INT_MAX); } + adjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); cublasOperation_t opa = convertTransToCublasOperation(transa); cublasOperation_t opb = convertTransToCublasOperation(transb); @@ -226,15 +201,30 @@ void THCudaBlas_BgemmStridedBatched(THCState *state, char 
transa, char transb, i cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); float fAlpha = alpha; float fBeta = beta; + +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 + cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); + if (prop->major < 8) { + TORCH_CHECK(false, "BFloat16 gemm in CUDA requires Ampere or later GPU"); + } + THCublasCheck(cublasGemmStridedBatchedEx(handle, + opa, opb, (int)m, (int)n, (int)k, + (void*)&fAlpha, a, CUDA_R_16BF, (int)lda, strideA, + b, CUDA_R_16BF, (int)ldb, strideB, + (void*)&fBeta, c, CUDA_R_16BF, (int)ldc, strideC, + (int)batchCount, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); +#elif defined(__HIP_PLATFORM_HCC__) THCublasCheck(rocblas_gemm_strided_batched_ex(handle, opa, opb, (int)m, (int)n, (int)k, - (void*)&fAlpha, a, rocblas_datatype_bf16_r, (int)lda, strideA, - b, rocblas_datatype_bf16_r, (int)ldb, strideB, - (void*)&fBeta, c, rocblas_datatype_bf16_r, (int)ldc, strideC, - c, rocblas_datatype_bf16_r, (int)ldc, strideC, - (int) batchCount, rocblas_datatype_f32_r, rocblas_gemm_algo_standard, - 0, 0, NULL, NULL)); + (void*)&fAlpha, a, rocblas_datatype_bf16_r, (int)lda, strideA, + b, rocblas_datatype_bf16_r, (int)ldb, strideB, + (void*)&fBeta, c, rocblas_datatype_bf16_r, (int)ldc, strideC, + c, rocblas_datatype_bf16_r, (int)ldc, strideC, + (int) batchCount, rocblas_datatype_f32_r, rocblas_gemm_algo_standard, + 0, 0, NULL, NULL)); +#else + TORCH_CHECK(false, "THCudaBlas_BgemmStridedBatched is only available on CUDA_VERSION >= 11"); +#endif // defined(CUDA_VERSION) && CUDA_VERSION >= 11000 } -#endif // __HIP_PLATFORM_HCC__ void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, const float *a[], int64_t lda, const float *b[], int64_t ldb, @@ -270,7 +260,6 @@ void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, int64_t #endif } -#if CUDA_VERSION >= 8000 || defined __HIP_PLATFORM_HCC__ void THCudaBlas_SgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, const float *a, int64_t lda, int64_t strideA, const float *b, int64_t ldb, int64_t strideB, float beta, float *c, int64_t ldc, int64_t strideC, int64_t batchCount) @@ -294,7 +283,6 @@ void THCudaBlas_SgemmStridedBatched(THCState *state, char transa, char transb, i &alpha, a, (int)lda, strideA, b, (int)ldb, strideB, &beta, c, (int)ldc, strideC, (int)batchCount)); } -#endif void THCudaBlas_DgemmBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, const double *a[], int64_t lda, const double *b[], int64_t ldb, @@ -330,7 +318,6 @@ void THCudaBlas_DgemmBatched(THCState *state, char transa, char transb, int64_t #endif } -#if CUDA_VERSION >= 8000 || defined __HIP_PLATFORM_HCC__ void THCudaBlas_DgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, const double *a, int64_t lda, int64_t strideA, const double *b, int64_t ldb, int64_t strideB, double beta, double *c, int64_t ldc, int64_t strideC, int64_t batchCount) @@ -353,5 +340,3 @@ void THCudaBlas_DgemmStridedBatched(THCState *state, char transa, char transb, i &alpha, a, (int)lda, strideA, b, (int)ldb, strideB, &beta, c, (int)ldc, strideC, (int)batchCount)); } -#endif - diff --git a/aten/src/THC/THCBlas.h b/aten/src/THC/THCBlas.h index cff3180a974a..4078363eb888 100644 --- a/aten/src/THC/THCBlas.h +++ b/aten/src/THC/THCBlas.h @@ -14,9 +14,8 @@ THC_API void THCudaBlas_Sgemm(THCState *state, char transa, char 
transb, int64_t THC_API void THCudaBlas_Dgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, double *a, int64_t lda, double *b, int64_t ldb, double beta, double *c, int64_t ldc); THC_API void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, THHalf alpha, THHalf *a, int64_t lda, THHalf *b, int64_t ldb, THHalf beta, THHalf *c, int64_t ldc); -#ifdef __HIP_PLATFORM_HCC__ + THC_API void THCudaBlas_Bgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, at::BFloat16 alpha, at::BFloat16 *a, int64_t lda, at::BFloat16 *b, int64_t ldb, at::BFloat16 beta, at::BFloat16 *c, int64_t ldc); -#endif THC_API void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, const float *a[], int64_t lda, const float *b[], int64_t ldb, @@ -24,25 +23,19 @@ THC_API void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, THC_API void THCudaBlas_DgemmBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, const double *a[], int64_t lda, const double *b[], int64_t ldb, double beta, double *c[], int64_t ldc, int64_t batchCount); -#if CUDA_VERSION >= 8000 || defined __HIP_PLATFORM_HCC__ THC_API void THCudaBlas_SgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, const float *a, int64_t lda, int64_t strideA, const float *b, int64_t ldb, int64_t strideB, float beta, float *c, int64_t ldc, int64_t strideC, int64_t batchCount); THC_API void THCudaBlas_DgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, const double *a, int64_t lda, int64_t strideA, const double *b, int64_t ldb, int64_t strideB, double beta, double *c, int64_t ldc, int64_t strideC, int64_t batchCount); -#endif -#if CUDA_VERSION >= 9010 || defined(__HIP_PLATFORM_HCC__) void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, THHalf alpha, const THHalf *a, int64_t lda, int64_t strideA, const THHalf *b, int64_t ldb, int64_t strideB, THHalf beta, THHalf *c, int64_t ldc, int64_t strideC, int64_t batchCount); -#endif -#ifdef __HIP_PLATFORM_HCC__ void THCudaBlas_BgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, at::BFloat16 alpha, const at::BFloat16 *a, int64_t lda, int64_t strideA, const at::BFloat16 *b, int64_t ldb, int64_t strideB, at::BFloat16 beta, at::BFloat16 *c, int64_t ldc, int64_t strideC, int64_t batchCount); -#endif #endif diff --git a/aten/src/THC/THCDeviceUtils.cuh b/aten/src/THC/THCDeviceUtils.cuh index 171488d91214..5bd751a4921f 100644 --- a/aten/src/THC/THCDeviceUtils.cuh +++ b/aten/src/THC/THCDeviceUtils.cuh @@ -7,6 +7,8 @@ #include #endif +#include + /* The largest consecutive integer representable in float32 (2^24) */ #define FLOAT32_MAX_CONSECUTIVE_INT 16777216.0f @@ -32,7 +34,7 @@ __host__ __device__ __forceinline__ T THCRoundUp(T a, T b) { */ template __device__ __forceinline__ T doLdg(const T* p) { -#if __CUDA_ARCH__ >= 350 +#ifndef __HIP_PLATFORM_HCC__ return __ldg(p); #else return *p; diff --git a/aten/src/THC/THCTensorIndex.cu b/aten/src/THC/THCTensorIndex.cu index 0287f31f658e..dfd3a510e6e1 100644 --- a/aten/src/THC/THCTensorIndex.cu +++ b/aten/src/THC/THCTensorIndex.cu @@ -218,20 +218,6 @@ struct WrapIndexOp { int64_t size; }; -template -struct TensorTakeOp { - TensorTakeOp(TensorInfo info, IndexType 
numel, int64_t*, int64_t*) - : info(info), numel(numel) {} - - __device__ __forceinline__ void operator()(T* out, int64_t* index) { - auto offset = indexToOffset(info, *index, numel); - *out = info.data[offset]; - } - - const TensorInfo info; - IndexType numel; -}; - template struct TensorPutOp { TensorPutOp(TensorInfo info, IndexType numel, int64_t*, int64_t*) diff --git a/aten/src/THC/generic/THCTensorIndex.cu b/aten/src/THC/generic/THCTensorIndex.cu index a6c621c8ef15..07303fa47096 100644 --- a/aten/src/THC/generic/THCTensorIndex.cu +++ b/aten/src/THC/generic/THCTensorIndex.cu @@ -220,21 +220,6 @@ void THCTensor_(indexCopy)(THCState *state, THCTensor *dst, int dim, THCudaLongT #undef LARGE_INDEX } -void THCTensor_(take)(THCState *state, THCTensor *dst, THCTensor *src, THCudaLongTensor *index) -{ - THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, dst, src)); - THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, index)); - - THArgCheck(THCTensor_(nDimensionLegacyNoScalars)(state, src) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); - THArgCheck(THCTensor_(nDimensionLegacyNoScalars)(state, dst) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); - THArgCheck(THCudaLongTensor_nDimensionLegacyNoScalars(state, index) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); - THArgCheck(!(THCTensor_(numel)(state, src) == 0 && THCudaLongTensor_numel(state, index) != 0), 2, - "tried to take from an empty tensor"); - - THCTensor_(resizeNd)(state, dst, index->dim(), THTensor_getSizePtr(index), NULL); - dispatchTakePut(state, src, dst, index); -} - static void THCTensor_(sort_indices)(THCState *state, THCudaLongTensor *index, THCTensor *src) { THCThrustAllocator thrustAlloc(state); diff --git a/aten/src/THC/generic/THCTensorMathBlas.cu b/aten/src/THC/generic/THCTensorMathBlas.cu index 3158e0e267ed..a5d159a9cace 100644 --- a/aten/src/THC/generic/THCTensorMathBlas.cu +++ b/aten/src/THC/generic/THCTensorMathBlas.cu @@ -281,7 +281,7 @@ void THCTensor_(baddbmm)(THCState *state, THCTensor *result, THCTensor *t, #endif //CUDA_VERSION #elif defined(THC_REAL_IS_BFLOAT16) -#if defined(__HIP_PLATFORM_HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(CUDA_VERSION) && CUDA_VERSION >= 11000 THCudaBlas_BgemmStridedBatched( state, transpose_batch1, @@ -310,15 +310,13 @@ void THCTensor_(baddbmm)(THCState *state, THCTensor *result, THCTensor *t, THCTensor_(freeCopyTo)(state, result_, result); } -#if defined(THC_REAL_IS_BFLOAT16) && !defined(__HIP_PLATFORM_HCC__) +#if defined(THC_REAL_IS_BFLOAT16) && !(defined(__HIP_PLATFORM_HCC__) || defined(CUDA_VERSION) && CUDA_VERSION >= 11000) // To avoid "variable was set but never used" warning [&transpose_batch1, &transpose_batch2, &lda, &ldb, &ldc]{}(); TORCH_CHECK(false, "BgemmStridedBatched is not supported with at::BFloat16 type"); #endif } -#if !defined(THC_REAL_IS_BFLOAT16) || defined(__HIP_PLATFORM_HCC__) at::namedinference::propagate_names_if_nonempty(result, maybe_outnames); -#endif #else ERROR_ONLY_FP_TYPES("baddbmm"); diff --git a/aten/src/THC/generic/THCTensorTopK.cu b/aten/src/THC/generic/THCTensorTopK.cu index a50f5e8f51ac..357b3f2e22f3 100644 --- a/aten/src/THC/generic/THCTensorTopK.cu +++ b/aten/src/THC/generic/THCTensorTopK.cu @@ -9,9 +9,6 @@ void THCTensor_(topk)(THCState* state, THCudaLongTensor *indices, THCTensor *input_, int64_t k, int dim, int dir, int sorted) { - #if defined(THC_REAL_IS_BFLOAT16) && !defined(__HIP_PLATFORM_HCC__) - TORCH_CHECK(false, "topk not suppported with BFloat16"); - #else THAssert(topK != NULL && indices != NULL && input_ != NULL); 
THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, topK, indices, input_)); dim = at::maybe_wrap_dim(dim, input_); @@ -186,7 +183,6 @@ void THCTensor_(topk)(THCState* state, THCudaLongTensor_free(state, input); THCudaCheck(cudaGetLastError()); - #endif // THC_REAL_IS_BFLOAT16 && !__HIP_PLATFORM_HCC__ } #endif // THC_GENERIC_FILE diff --git a/aten/src/THCUNN/generic/SpatialConvolutionMM.cu b/aten/src/THCUNN/generic/SpatialConvolutionMM.cu index 535c43636af0..44616bf4cf60 100644 --- a/aten/src/THCUNN/generic/SpatialConvolutionMM.cu +++ b/aten/src/THCUNN/generic/SpatialConvolutionMM.cu @@ -114,9 +114,6 @@ void THNN_(SpatialConvolutionMM_updateOutput)( int kW, int kH, int dW, int dH, int padW, int padH) { - #if defined(THC_REAL_IS_BFLOAT16) && !defined(__HIP_PLATFORM_HCC__) - TORCH_CHECK(false, "SpatialConvolutionMM_updateOutput not suppported with BFloat16"); - #else THCUNN_assertSameGPU(state, 5, input, output, weight, columns, ones); if (bias) { THCUNN_assertSameGPU(state, 2, weight, bias); @@ -267,7 +264,6 @@ void THNN_(SpatialConvolutionMM_updateOutput)( THCTensor_(free)(state, input); THCTensor_(free)(state, weight); - #endif // THC_REAL_IS_BFLOAT16 && !__HIP_PLATFORM_HCC__ } void THNN_(SpatialConvolutionMM_updateGradInput)( @@ -281,10 +277,6 @@ void THNN_(SpatialConvolutionMM_updateGradInput)( int kW, int kH, int dW, int dH, int padW, int padH) { - - #if defined(THC_REAL_IS_BFLOAT16) && !defined(__HIP_PLATFORM_HCC__) - TORCH_CHECK(false, "SpatialConvolutionMM_updateGradInput not suppported with BFloat16"); - #else THCUNN_assertSameGPU(state, 5, input, gradOutput, weight, gradColumns, gradInput); weight = THNN_(newViewWeightMM2d)(state, weight); @@ -380,7 +372,6 @@ void THNN_(SpatialConvolutionMM_updateGradInput)( THCTensor_(free)(state, input); THCTensor_(free)(state, gradOutput); - #endif // THC_REAL_IS_BFLOAT16 && !__HIP_PLATFORM_HCC__ } void THNN_(SpatialConvolutionMM_accGradParameters)( @@ -395,10 +386,6 @@ void THNN_(SpatialConvolutionMM_accGradParameters)( int dW, int dH, int padW, int padH, accreal scale_) { - - #if defined(THC_REAL_IS_BFLOAT16) && !defined(__HIP_PLATFORM_HCC__) - TORCH_CHECK(false, "SpatialConvolutionMM_updateGradParameters not suppported with BFloat16"); - #else scalar_t scale = ScalarConvert::to(scale_); THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, gradBias, columns, ones); if (gradWeight) { @@ -554,7 +541,6 @@ void THNN_(SpatialConvolutionMM_accGradParameters)( THCTensor_(free)(state, input); THCTensor_(free)(state, gradOutput); - #endif // THC_REAL_IS_BFLOAT16 && !__HIP_PLATFORM_HCC__ } #endif diff --git a/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu b/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu index 18d8da647d15..53eff031a822 100644 --- a/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu +++ b/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu @@ -13,9 +13,6 @@ void THNN_(SpatialDepthwiseConvolution_updateOutput)( int padW, int padH, int dilationW, int dilationH) { - #if defined(THC_REAL_IS_BFLOAT16) && !defined(__HIP_PLATFORM_HCC__) - TORCH_CHECK(false, "SpatialDepthwiseConvolution_updateOutput not suppported with BFloat16"); - #else THCUNN_assertSameGPU(state, 3, input, output, weight); // Only handle 4D Input Tensors for now @@ -94,7 +91,6 @@ void THNN_(SpatialDepthwiseConvolution_updateOutput)( THCTensor_(free)(state, input); THCTensor_(free)(state, weight); if (bias) THCTensor_(free)(state, bias); - #endif // THC_REAL_IS_BFLOAT16 && !__HIP_PLATFORM_HCC__ } void 
THNN_(SpatialDepthwiseConvolution_updateGradInput)( @@ -108,9 +104,6 @@ void THNN_(SpatialDepthwiseConvolution_updateGradInput)( int padW, int padH, int dilationW, int dilationH) { - #if defined(THC_REAL_IS_BFLOAT16) && !defined(__HIP_PLATFORM_HCC__) - TORCH_CHECK(false, "SpatialDepthwiseConvolution_updateGradInput not suppported with BFloat16"); - #else THCUNN_assertSameGPU(state, 3, gradOutput, gradInput, weight); // Only handle 4D Input Tensors for now @@ -203,7 +196,6 @@ void THNN_(SpatialDepthwiseConvolution_updateGradInput)( THCTensor_(free)(state, weight); THCTensor_(free)(state, gradOutput); - #endif // THC_REAL_IS_BFLOAT16 && !__HIP_PLATFORM_HCC__ } void THNN_(SpatialDepthwiseConvolution_accGradParameters)( @@ -216,9 +208,6 @@ void THNN_(SpatialDepthwiseConvolution_accGradParameters)( int padW, int padH, int dilationW, int dilationH) { - #if defined(THC_REAL_IS_BFLOAT16) && !defined(__HIP_PLATFORM_HCC__) - TORCH_CHECK(false, "SpatialDepthwiseConvolution_accGradParameters not suppported with BFloat16"); - #else THCUNN_assertSameGPU(state, 3, input, gradOutput, gradWeight); // Only handle 4D Input Tensors for now @@ -271,7 +260,6 @@ void THNN_(SpatialDepthwiseConvolution_accGradParameters)( THCudaCheck(cudaGetLastError()); THCTensor_(free)(state, gradOutput); - #endif // THC_REAL_IS_BFLOAT16 && !__HIP_PLATFORM_HCC__ } #endif diff --git a/benchmarks/fastrnns/fuser.py b/benchmarks/fastrnns/fuser.py index 620c19a13cf1..5b85f87291dc 100644 --- a/benchmarks/fastrnns/fuser.py +++ b/benchmarks/fastrnns/fuser.py @@ -1,12 +1,12 @@ import torch def set_fuser(fuser_name, executor_name): - assert fuser_name in ['te', 'old', 'none'] + assert fuser_name in ['te', 'old', 'none', 'default'] if fuser_name == 'te': torch._C._jit_set_profiling_executor(True) torch._C._jit_set_profiling_mode(True) torch._C._jit_set_bailout_depth(20) - torch._C._jit_set_num_profiled_runs(2) + torch._C._jit_set_num_profiled_runs(1) torch._C._jit_override_can_fuse_on_cpu(False) torch._C._jit_override_can_fuse_on_gpu(True) torch._C._jit_set_texpr_fuser_enabled(True) @@ -21,6 +21,8 @@ def set_fuser(fuser_name, executor_name): torch._C._jit_override_can_fuse_on_gpu(False) torch._C._jit_override_can_fuse_on_cpu(False) torch._C._jit_set_texpr_fuser_enabled(False) + elif fuser_name == 'default': + pass # --executor overrides settings of --fuser if executor_name == 'profiling': @@ -34,3 +36,5 @@ def set_fuser(fuser_name, executor_name): elif executor_name == 'legacy': torch._C._jit_set_profiling_executor(False) torch._C._jit_set_profiling_mode(False) + elif executor_name == 'default': + pass diff --git a/benchmarks/operator_benchmark/benchmark_pytorch.py b/benchmarks/operator_benchmark/benchmark_pytorch.py index 4d927d73bfc0..1c5a905f2b75 100644 --- a/benchmarks/operator_benchmark/benchmark_pytorch.py +++ b/benchmarks/operator_benchmark/benchmark_pytorch.py @@ -1,7 +1,7 @@ import time import json import torch -import torch.utils.cpp_extension as cpp_extension # noqa +import cpp_extension # noqa """PyTorch performance microbenchmarks. 
@@ -149,14 +149,14 @@ def run_forward(self, num_runs, print_per_iter, cuda_sync): for _ in range(num_runs): start_time = time.time() self.output = self.op_bench.forward() - if cuda_sync: + if cuda_sync: torch.cuda.synchronize(torch.cuda.current_device()) end_time = time.time() self.time_series.append((end_time - start_time) * 1e3) else: for _ in range(num_runs): self.output = self.op_bench.forward() - if cuda_sync: + if cuda_sync: torch.cuda.synchronize(torch.cuda.current_device()) def _output_mean(self): diff --git a/benchmarks/operator_benchmark/pt/unary_test.py b/benchmarks/operator_benchmark/pt/unary_test.py index 4a8a7865330b..1391283b1e10 100644 --- a/benchmarks/operator_benchmark/pt/unary_test.py +++ b/benchmarks/operator_benchmark/pt/unary_test.py @@ -91,6 +91,7 @@ def forward(self): ['sigmoid', torch.sigmoid], ['sigmoid_', torch.sigmoid_], ['sign', torch.sign], + ['sgn', torch.sgn], ['sin', torch.sin], ['sin_', torch.sin_], ['sinh', torch.sinh], diff --git a/benchmarks/profiler_benchmark/profiler_bench.py b/benchmarks/profiler_benchmark/profiler_bench.py index 616d1078ee7d..6b187b03522e 100644 --- a/benchmarks/profiler_benchmark/profiler_bench.py +++ b/benchmarks/profiler_benchmark/profiler_bench.py @@ -1,33 +1,23 @@ -from functools import partial -import itertools +import argparse import statistics +import sys import timeit import torch -TENSOR_SIZES = [1, 32, 128, 256, 512] -INTERNAL_ITER = 256 -PARALLEL_TASKS_NUM = 4 -N = 100 +from torch.utils._benchmark import Timer +PARALLEL_TASKS_NUM = 4 +INTERNAL_ITER = None def loop_workload(x): for i in range(INTERNAL_ITER): x = torch.mm(x, x) return x -traced_loop_workload = None -def run_profiler_benchmark_loop(input_x, use_cuda, profiling_enabled): - if profiling_enabled: - with torch.autograd.profiler.profile(use_cuda=use_cuda) as prof: - traced_loop_workload(input_x) - else: - traced_loop_workload(input_x) - -def parallel_task(x): - for i in range(int(INTERNAL_ITER / PARALLEL_TASKS_NUM)): - x = torch.mm(x, x) - return x - def parallel_workload(x): + def parallel_task(x): + for i in range(int(INTERNAL_ITER / PARALLEL_TASKS_NUM)): + x = torch.mm(x, x) + return x futs = [] for i in range(PARALLEL_TASKS_NUM): futs.append(torch.jit._fork(parallel_task, x)) @@ -35,50 +25,85 @@ def parallel_workload(x): torch.jit._wait(futs[i]) return x -traced_parallel_workload = None -def run_profiler_benchmark_parallel(input_x, use_cuda, profiling_enabled): - if profiling_enabled: - with torch.autograd.profiler.profile(use_cuda=use_cuda) as prof: - traced_parallel_workload(input_x) - else: - traced_parallel_workload(input_x) if __name__ == '__main__': - for workload_name in ["loop", "parallel"]: - print("Payload: {}; {} iterations, N = {}\n".format( - workload_name, INTERNAL_ITER, N)) - for params in itertools.product([False, True], TENSOR_SIZES, [False, True]): - use_cuda = params[0] - profiling_tensor_size = params[1] - profiling_enabled = params[2] - - if (use_cuda and not torch.cuda.is_available()): - continue - - print("Profiling {}, tensor size {}x{}, use cuda: {}".format( - "enabled" if profiling_enabled else "disabled", - profiling_tensor_size, profiling_tensor_size, use_cuda)) - - input_x = torch.rand(profiling_tensor_size, profiling_tensor_size) - if use_cuda: - input_x = input_x.cuda() - workload = None - if workload_name == "loop": - workload = partial( - run_profiler_benchmark_loop, input_x, use_cuda, profiling_enabled) - traced_loop_workload = torch.jit.trace(loop_workload, input_x) - elif workload_name == "parallel": - workload = partial( 
- run_profiler_benchmark_parallel, input_x, use_cuda, profiling_enabled) - traced_parallel_workload = torch.jit.trace( - parallel_workload, input_x) - - runtimes = timeit.repeat(workload, repeat=N, number=1) + torch._C._set_graph_executor_optimize(False) + parser = argparse.ArgumentParser( + description='Profiler benchmark') + + parser.add_argument('--with_cuda', action='store_true') + parser.add_argument('--with_stack', action='store_true') + parser.add_argument('--use_script', action='store_true') + parser.add_argument('--profiling_tensor_size', default=1, type=int) + parser.add_argument('--workload', default='loop', type=str) + parser.add_argument('--internal_iter', default=256, type=int) + parser.add_argument('--n', default=100, type=int) + parser.add_argument('--use_timer', action='store_true') + parser.add_argument('--timer_min_run_time', default=100, type=int) + + args = parser.parse_args() + + if args.with_cuda and not torch.cuda.is_available(): + print("No CUDA available") + sys.exit() + + print("Payload: {}; {} iterations, N = {}\n".format( + args.workload, args.internal_iter, args.n)) + INTERNAL_ITER = args.internal_iter + + for profiling_enabled in [False, True]: + print("Profiling {}, tensor size {}x{}, use cuda: {}, with stacks: {}, use script: {}".format( + "enabled" if profiling_enabled else "disabled", + args.profiling_tensor_size, + args.profiling_tensor_size, + args.with_cuda, + args.with_stack, + args.use_script)) + + input_x = torch.rand( + args.profiling_tensor_size, + args.profiling_tensor_size) + + if args.with_cuda: + input_x = input_x.cuda() + + workload = None + assert args.workload in ["loop", "parallel"] + if args.workload == "loop": + workload = loop_workload + else: + workload = parallel_workload + + if args.use_script: + traced_workload = torch.jit.trace(workload, (input_x,)) + workload = traced_workload + + if profiling_enabled: + def payload(): + x = None + with torch.autograd.profiler.profile( + use_cuda=args.with_cuda, + with_stack=args.with_stack) as prof: + x = workload(input_x) + return x + else: + def payload(): + return workload(input_x) + + if args.use_timer: + t = Timer( + "payload()", + globals={"payload": payload}, + timer=timeit.default_timer, + ).blocked_autorange(min_run_time=args.timer_min_run_time) + print(t) + else: + runtimes = timeit.repeat(payload, repeat=args.n, number=1) avg_time = statistics.mean(runtimes) * 1000.0 stddev_time = statistics.stdev(runtimes) * 1000.0 print("\tavg. 
time: {:.3f} ms, stddev: {:.3f} ms".format( avg_time, stddev_time)) - if workload_name == "loop": + if args.workload == "loop": print("\ttime per iteration: {:.3f} ms".format( - avg_time / INTERNAL_ITER)) - print() + avg_time / args.internal_iter)) + print() diff --git a/benchmarks/record_function_benchmark/record_function_bench.py b/benchmarks/record_function_benchmark/record_function_bench.py index ddd8243ebf0a..830328247bb5 100644 --- a/benchmarks/record_function_benchmark/record_function_bench.py +++ b/benchmarks/record_function_benchmark/record_function_bench.py @@ -1,7 +1,7 @@ import argparse import sys import torch -import torch.utils._benchmark as benchmark_utils +import torch.utils.benchmark as benchmark_utils try: diff --git a/benchmarks/static_runtime/CMakeLists.txt b/benchmarks/static_runtime/CMakeLists.txt index 6191150dc61b..0a263c2a5a91 100644 --- a/benchmarks/static_runtime/CMakeLists.txt +++ b/benchmarks/static_runtime/CMakeLists.txt @@ -1,3 +1,7 @@ -list(APPEND STATIC_RUNTIME_BENCHMARK_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/deep_wide_pt_bench.cc) list(APPEND STATIC_RUNTIME_BENCHMARK_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/deep_wide_pt.cc) +list(APPEND STATIC_RUNTIME_BENCHMARK_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/deep_wide_pt_bench.cc) set(STATIC_RUNTIME_BENCHMARK_SRCS ${STATIC_RUNTIME_BENCHMARK_SRCS} PARENT_SCOPE) + +list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/deep_wide_pt.cc) +list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/test_static_runtime.cc) +set(STATIC_RUNTIME_TEST_SRCS ${STATIC_RUNTIME_TEST_SRCS} PARENT_SCOPE) diff --git a/benchmarks/static_runtime/deep_wide_pt_bench.cc b/benchmarks/static_runtime/deep_wide_pt_bench.cc index ef960d28d7eb..21c2923f8301 100644 --- a/benchmarks/static_runtime/deep_wide_pt_bench.cc +++ b/benchmarks/static_runtime/deep_wide_pt_bench.cc @@ -60,7 +60,8 @@ static void BM_deep_wide_jit_profiling_executor(benchmark::State& state) { static void BM_deep_wide_static(benchmark::State& state) { auto mod = getDeepAndWideSciptModel(); - torch::jit::StaticRuntime runtime(mod); + auto g = torch::jit::PrepareForStaticRuntime(mod); + torch::jit::StaticRuntime runtime(g); const int batch_size = state.range(0); auto ad_emb_packed = torch::randn({batch_size, 1, embedding_size}); @@ -75,6 +76,28 @@ static void BM_deep_wide_static(benchmark::State& state) { } } +const std::shared_ptr& getStaticGraph() { + static const std::shared_ptr g = + torch::jit::PrepareForStaticRuntime(getDeepAndWideSciptModel()); + return g; +} + +static void BM_deep_wide_static_threaded(benchmark::State& state) { + auto g = getStaticGraph(); + torch::jit::StaticRuntime runtime(g); + + const int batch_size = 1; // state.range(0); + auto ad_emb_packed = torch::randn({batch_size, 1, embedding_size}); + auto user_emb = torch::randn({batch_size, 1, embedding_size}); + auto wide = torch::randn({batch_size, num_features}); + + std::vector inputs({ad_emb_packed, user_emb, wide}); + + for (auto _ : state) { + runtime.run(inputs); + } +} + BENCHMARK(BM_deep_wide_base)->RangeMultiplier(8)->Ranges({{1, 20}}); BENCHMARK(BM_deep_wide_jit_graph_executor) @@ -86,5 +109,6 @@ BENCHMARK(BM_deep_wide_jit_profiling_executor) ->Ranges({{1, 20}}); BENCHMARK(BM_deep_wide_static)->RangeMultiplier(8)->Ranges({{1, 20}}); +BENCHMARK(BM_deep_wide_static_threaded)->Threads(8); BENCHMARK_MAIN(); diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc index 3ad0956ced73..172073705ea1 100644 --- 
a/benchmarks/static_runtime/test_static_runtime.cc +++ b/benchmarks/static_runtime/test_static_runtime.cc @@ -14,7 +14,85 @@ TEST(StaticRuntime, TrivialModel) { // run static runtime std::vector input_tensors({a, b, c}); - torch::jit::StaticRuntime runtime(mod); + auto g = torch::jit::PrepareForStaticRuntime(mod); + torch::jit::StaticRuntime runtime(g); at::Tensor output_2 = runtime.run(input_tensors)[0]; EXPECT_TRUE(output_1.equal(output_2)); } + +TEST(StaticRuntime, DeepWide) { + const int embedding_size = 32; + const int num_features = 50; + torch::jit::Module mod = getDeepAndWideSciptModel(); + auto g = torch::jit::PrepareForStaticRuntime(mod); + torch::jit::StaticRuntime runtime(g); + + for (int batch_size : {1, 8, 32}) { + for (int i = 0; i < 5; ++i) { + auto ad_emb_packed = torch::randn({batch_size, 1, embedding_size}); + auto user_emb = torch::randn({batch_size, 1, embedding_size}); + auto wide = torch::randn({batch_size, num_features}); + + // run jit graph executor + std::vector inputs({ad_emb_packed, user_emb, wide}); + at::Tensor output_1 = mod.forward(inputs).toTensor(); + + // run static runtime + std::vector input_tensors({ad_emb_packed, user_emb, wide}); + at::Tensor output_2 = runtime.run(input_tensors)[0]; + EXPECT_TRUE(output_1.equal(output_2)); + } + } +} + +TEST(StaticRuntime, KWargsAPI_1) { + const int embedding_size = 32; + const int num_features = 50; + auto module = getDeepAndWideSciptModel(); + torch::jit::StaticRuntime runtime(module); + + for (int batch_size : {1, 8, 32}) { + for (int i = 0; i < 5; ++i) { + auto ad_emb_packed = torch::randn({batch_size, 1, embedding_size}); + auto user_emb = torch::randn({batch_size, 1, embedding_size}); + auto wide = torch::randn({batch_size, num_features}); + + // run jit graph executor + std::vector inputs({ad_emb_packed, user_emb, wide}); + at::Tensor output_1 = module.forward(inputs).toTensor(); + + // run static runtime + at::Tensor output_2 = runtime.run(inputs, {}).toTensor(); + EXPECT_TRUE(output_1.equal(output_2)); + } + } +} + +TEST(StaticRuntime, KWargsAPI_2) { + const int embedding_size = 32; + const int num_features = 50; + auto module = getDeepAndWideSciptModel(); + auto g = torch::jit::PrepareForStaticRuntime(module); + torch::jit::StaticRuntime runtime(module); + + for (int batch_size : {1, 8, 32}) { + for (int i = 0; i < 5; ++i) { + auto ad_emb_packed = torch::randn({batch_size, 1, embedding_size}); + auto user_emb = torch::randn({batch_size, 1, embedding_size}); + auto wide = torch::randn({batch_size, num_features}); + + // run jit graph executor + std::vector args({ad_emb_packed, user_emb, wide}); + at::Tensor output_1 = module.forward(args).toTensor(); + + std::unordered_map kwargs( + {{"ad_emb_packed", ad_emb_packed}, + {"user_emb", user_emb}, + {"wide", wide}}); + + // run static runtime + at::Tensor output_2 = runtime.run({}, kwargs).toTensor(); + EXPECT_TRUE(output_1.equal(output_2)); + } + } +} diff --git a/binaries/record_function_benchmark.cc b/binaries/record_function_benchmark.cc index a7e3383b97f4..d924003b9270 100644 --- a/binaries/record_function_benchmark.cc +++ b/binaries/record_function_benchmark.cc @@ -9,8 +9,8 @@ C10_DEFINE_int(iter, 100, "Number of iterations"); C10_DEFINE_int(warmup_iter, 10, "Number of warmup iterations"); -C10_DEFINE_int(rec_fn_iter, 10e6, - "Number of iterations for the pure RecordFunction benchmark"); +C10_DEFINE_int(sampled_iter, 10e6, + "Number of iterations for the sampled observer benchmark"); namespace { const int kInnerIter = 100; @@ -23,6 +23,8 @@ const float 
kLowSamplingProb = 0.0001; } void setupBenchmarkCallbacks() { + at::enableRecordFunction(); + at::clearCallbacks(); // non-sampled callback at::addGlobalCallback(at::RecordFunctionCallback( [&](const at::RecordFunction& fn) {}, @@ -40,7 +42,7 @@ void setupBenchmarkCallbacks() { } } -float runBench(int tensor_size, int outer_iter) { +float runTensorBench(int tensor_size, int outer_iter) { typedef std::chrono::high_resolution_clock clock; typedef std::chrono::microseconds us; std::chrono::time_point start_time = clock::now(); @@ -54,30 +56,53 @@ float runBench(int tensor_size, int outer_iter) { return duration; } -int main(int argc, char** argv) { - if (!c10::ParseCommandLineFlags(&argc, &argv)) { - std::cout << "Failed to parse command line flags" << std::endl; - return -1; +float runPureRecordFunctionBench(int outer_iter) { + typedef std::chrono::high_resolution_clock clock; + typedef std::chrono::microseconds us; + std::chrono::time_point start_time = clock::now(); + for (auto n = 0; n < outer_iter; ++n) { + RECORD_USER_SCOPE("test"); } + auto duration = static_cast( + std::chrono::duration_cast(clock::now() - start_time).count()); + return duration; +} - at::enableRecordFunction(); - setupBenchmarkCallbacks(); - - auto duration = runBench(kSmallTensorSize, FLAGS_warmup_iter); - std::cout << "Warmup time: " << duration << " us." << std::endl; - +void runBenchmark() { + float duration = 0; for (auto tensor_size : std::set({kSmallTensorSize, kTensorSize})) { - duration = runBench(tensor_size, FLAGS_iter); - std::cout << "Time per iteration (" + duration = runTensorBench(tensor_size, FLAGS_iter); + std::cout << "Running tensor benchmark, time per iteration (" << tensor_size << "x" << tensor_size << "): " << (duration/FLAGS_iter) << " us." << std::endl; } + duration = runPureRecordFunctionBench(FLAGS_iter * 100); + std::cout << "Running pure RecordFunction benchmark, time per iteration: " + << (duration/FLAGS_iter) + << " us." << std::endl; +} + +int main(int argc, char** argv) { + if (!c10::ParseCommandLineFlags(&argc, &argv)) { + std::cout << "Failed to parse command line flags" << std::endl; + return -1; + } + + auto duration = runTensorBench(kSmallTensorSize, FLAGS_warmup_iter); + std::cout << "Warmup time: " << duration << " us." 
<< std::endl; + + setupBenchmarkCallbacks(); + std::cout << "Running with empty observers" << std::endl; + runBenchmark(); at::clearCallbacks(); + std::cout << "Running without observers" << std::endl; + runBenchmark(); + std::cout << "Running sampled observer benchmark" << std::endl; int cb_count = 0; at::addGlobalCallback(at::RecordFunctionCallback( [&](const at::RecordFunction& fn) { @@ -88,18 +113,12 @@ int main(int argc, char** argv) { .samplingProb(kLowSamplingProb) ); - typedef std::chrono::high_resolution_clock clock; - typedef std::chrono::microseconds us; - std::chrono::time_point start_time = clock::now(); - for (auto n = 0; n < FLAGS_rec_fn_iter; ++n) { - RECORD_USER_SCOPE("test"); - } - duration = static_cast( - std::chrono::duration_cast(clock::now() - start_time).count()); - std::cout << "Pure RecordFunction runtime of " << FLAGS_rec_fn_iter + runPureRecordFunctionBench(FLAGS_sampled_iter); + + std::cout << "Pure RecordFunction runtime of " << FLAGS_sampled_iter << " iterations " << duration << " us, number of callback invocations: " << cb_count - << ", expected number: ~" << (int)(FLAGS_rec_fn_iter * kLowSamplingProb) + << ", expected number: ~" << (int)(FLAGS_sampled_iter * kLowSamplingProb) << " invocations" << std::endl; at::clearCallbacks(); diff --git a/binaries/speed_benchmark_torch.cc b/binaries/speed_benchmark_torch.cc index db78467cfb43..09f1cabb8e15 100644 --- a/binaries/speed_benchmark_torch.cc +++ b/binaries/speed_benchmark_torch.cc @@ -24,7 +24,7 @@ #include "torch/csrc/jit/serialization/import.h" #include "torch/script.h" -#include "c10/core/CPUCachingAllocator.h" +#include "c10/mobile/CPUCachingAllocator.h" #include using namespace std::chrono; diff --git a/c10/CMakeLists.txt b/c10/CMakeLists.txt index 17fd7e680122..48bceb440954 100644 --- a/c10/CMakeLists.txt +++ b/c10/CMakeLists.txt @@ -17,6 +17,7 @@ set(C10_USE_GFLAGS ${USE_GFLAGS}) # used in cmake_macros.h.in set(C10_USE_GLOG ${USE_GLOG}) # used in cmake_macros.h.in set(C10_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) # used in cmake_macros.h.in set(C10_USE_NUMA ${USE_NUMA}) +set(C10_USE_MSVC_STATIC_RUNTIME ${CAFFE2_USE_MSVC_STATIC_RUNTIME}) configure_file( ${CMAKE_CURRENT_LIST_DIR}/macros/cmake_macros.h.in ${CMAKE_BINARY_DIR}/c10/macros/cmake_macros.h) @@ -32,6 +33,7 @@ file(GLOB C10_SRCS core/dispatch/*.cpp core/op_registration/*.cpp core/impl/*.cpp + mobile/*.cpp macros/*.cpp util/*.cpp ) diff --git a/c10/core/CPUAllocator.cpp b/c10/core/CPUAllocator.cpp index e830aa4832d0..c76fefe21d27 100644 --- a/c10/core/CPUAllocator.cpp +++ b/c10/core/CPUAllocator.cpp @@ -1,6 +1,7 @@ #include -#include #include +#include +#include // TODO: rename flags to C10 C10_DEFINE_bool( @@ -156,13 +157,20 @@ class DefaultMobileCPUAllocator final : public at::Allocator { // TODO: enable with better TLS support on mobile // profiledCPUMemoryReporter().Delete(pointer); auto allocator_ptr = GetThreadLocalCachingAllocator(); + auto profiling_allocator_ptr = GetThreadLocalProfilingAllocator(); if (allocator_ptr != nullptr) { allocator_ptr->free(pointer); + } else if (profiling_allocator_ptr != nullptr) { + profiling_allocator_ptr->free(pointer); } else { c10::free_cpu(pointer); // This adds extra cost to freeing memory to the default case when // caching allocator is not enabled. 
CPUCachingAllocator::record_free(pointer); + auto allocation_planner = GetThreadLocalAllocationPlanner(); + if (allocation_planner != nullptr) { + allocation_planner->record_free(pointer); + } } } @@ -179,10 +187,17 @@ class DefaultMobileCPUAllocator final : public at::Allocator { auto alloc_size = PreGuardBytes + nbytes + PostGuardBytes; void* data; auto allocator_ptr = GetThreadLocalCachingAllocator(); + auto profiling_allocator_ptr = GetThreadLocalProfilingAllocator(); if (allocator_ptr != nullptr) { data = allocator_ptr->allocate(alloc_size); + } else if (profiling_allocator_ptr != nullptr) { + data = profiling_allocator_ptr->allocate(alloc_size); } else { data = c10::alloc_cpu(alloc_size); + auto allocation_planner = GetThreadLocalAllocationPlanner(); + if (allocation_planner != nullptr) { + allocation_planner->record_allocation(alloc_size, data); + } } // profiledCPUMemoryReporter().New(data, alloc_size); return { diff --git a/c10/core/DefaultDtype.cpp b/c10/core/DefaultDtype.cpp index daae181db9d7..c4f420ab6e22 100644 --- a/c10/core/DefaultDtype.cpp +++ b/c10/core/DefaultDtype.cpp @@ -3,11 +3,13 @@ namespace c10 { static auto default_dtype = caffe2::TypeMeta::Make(); +static auto default_dtype_as_scalartype = typeMetaToScalarType(default_dtype); static auto default_complex_dtype = caffe2::TypeMeta::Make>(); void set_default_dtype(caffe2::TypeMeta dtype) { default_dtype = std::move(dtype); - if(dtype == caffe2::TypeMeta::Make()) { + default_dtype_as_scalartype = typeMetaToScalarType(default_dtype); + if(default_dtype_as_scalartype == ScalarType::Double) { default_complex_dtype = std::move(caffe2::TypeMeta::Make>()); } else { default_complex_dtype = std::move(caffe2::TypeMeta::Make>()); @@ -17,6 +19,9 @@ void set_default_dtype(caffe2::TypeMeta dtype) { const caffe2::TypeMeta& get_default_dtype() { return default_dtype; } +ScalarType get_default_dtype_as_scalartype() { + return default_dtype_as_scalartype; +} const caffe2::TypeMeta& get_default_complex_dtype() { return default_complex_dtype; } diff --git a/c10/core/DefaultDtype.h b/c10/core/DefaultDtype.h index 402a6069bfc3..eda34b217727 100644 --- a/c10/core/DefaultDtype.h +++ b/c10/core/DefaultDtype.h @@ -1,6 +1,7 @@ #pragma once #include +#include namespace caffe2 { class TypeMeta; @@ -9,5 +10,6 @@ class TypeMeta; namespace c10 { C10_API void set_default_dtype(caffe2::TypeMeta dtype); C10_API const caffe2::TypeMeta& get_default_dtype(); +C10_API ScalarType get_default_dtype_as_scalartype(); C10_API const caffe2::TypeMeta& get_default_complex_dtype(); } // namespace c10 diff --git a/c10/core/ScalarType.h b/c10/core/ScalarType.h index 41980540017c..8f2acebd84f0 100644 --- a/c10/core/ScalarType.h +++ b/c10/core/ScalarType.h @@ -38,7 +38,8 @@ namespace c10 { _(c10::qint8, QInt8) /* 12 */ \ _(c10::quint8, QUInt8) /* 13 */ \ _(c10::qint32, QInt32) /* 14 */ \ - _(at::BFloat16, BFloat16) /* 15 */ + _(at::BFloat16, BFloat16) /* 15 */ \ + _(c10::quint4x2, QUInt4x2) /* 16 */ // If you want to support ComplexHalf for real, add ComplexHalf @@ -154,7 +155,8 @@ AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SPECIALIZE_CppTypeToScalarType) #define AT_FORALL_QINT_TYPES(_) \ _(c10::qint8, QInt8) \ _(c10::quint8, QUInt8) \ - _(c10::qint32, QInt32) + _(c10::qint32, QInt32) \ + _(c10::quint4x2, QUInt4x2) #define AT_FORALL_COMPLEX_TYPES(_) \ _(c10::complex, ComplexFloat) \ @@ -279,7 +281,7 @@ static inline bool isComplexType(ScalarType t) { static inline bool isQIntType(ScalarType t) { // Don't forget to extend this when adding new QInt types - return t 
== ScalarType:: QInt8 || t == ScalarType::QUInt8 || t == ScalarType::QInt32; + return t == ScalarType:: QInt8 || t == ScalarType::QUInt8 || t == ScalarType::QInt32 || t == ScalarType::QUInt4x2; } static inline ScalarType toQIntType(ScalarType t) { @@ -303,6 +305,8 @@ static inline ScalarType toUnderlying(ScalarType t) { return ScalarType::Char; case ScalarType::QInt32: return ScalarType::Int; + case ScalarType::QUInt4x2: + return ScalarType::Byte; default: return t; } diff --git a/c10/core/TensorOptions.h b/c10/core/TensorOptions.h index a42f4d4284f4..dd92f919662f 100644 --- a/c10/core/TensorOptions.h +++ b/c10/core/TensorOptions.h @@ -17,6 +17,29 @@ #include namespace c10 { + +DispatchKey computeDispatchKey(c10::optional dtype, c10::optional layout, c10::optional device); + +inline ScalarType dtype_or_default(c10::optional dtype) { + return dtype.has_value() ? *dtype : get_default_dtype_as_scalartype(); +} + +inline caffe2::TypeMeta dtype_or_default(c10::optional dtype) { + return dtype.has_value() ? *dtype : get_default_dtype(); +} + +inline Layout layout_or_default(c10::optional layout) { + return layout.has_value() ? *layout : kStrided; +} + +inline Device device_or_default(c10::optional device) { + return device.has_value() ? *device : Device(kCPU); +} + +inline bool pinned_memory_or_default(c10::optional pinned_memory) { + return pinned_memory.has_value() ? *pinned_memory : false; +} + /// A class to encapsulate construction axes of an Tensor. TensorOptions was /// designed to support the Python style API for specifying construction options /// on factory functions, e.g., @@ -228,7 +251,7 @@ struct C10_API TensorOptions { /// Returns the device of the `TensorOptions`. Device device() const noexcept { - return has_device_ ? device_ : Device(kCPU); + return device_or_default(device_opt()); } /// Returns whether the device is specified. @@ -249,7 +272,7 @@ struct C10_API TensorOptions { /// Returns the dtype of the `TensorOptions`. caffe2::TypeMeta dtype() const noexcept { - return has_dtype_ ? dtype_ : get_default_dtype(); + return dtype_or_default(dtype_opt()); } /// Returns whether the dtype is specified. @@ -265,7 +288,7 @@ struct C10_API TensorOptions { /// Returns the layout of the `TensorOptions`. Layout layout() const noexcept { - return has_layout_ ? layout_ : kStrided; + return layout_or_default(layout_opt()); } /// Returns whether the layout is specified. @@ -298,7 +321,7 @@ struct C10_API TensorOptions { /// Returns the `pinned_memory` property of the `TensorOptions`. bool pinned_memory() const noexcept { - return has_pinned_memory_ ? pinned_memory_ : false; + return pinned_memory_or_default(pinned_memory_opt()); } /// Returns whether the `pinned_memory` is specified. 
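The new `dtype_or_default` / `layout_or_default` / `device_or_default` / `pinned_memory_or_default` helpers above centralize the "use the value if set, otherwise fall back to a global default" logic that the `TensorOptions` accessors previously open-coded. A minimal sketch of the pattern, using hypothetical standalone names and `std::optional` in place of `c10::optional` so it builds outside PyTorch:

```cpp
#include <iostream>
#include <optional>

enum class Layout { Strided, Sparse, Mkldnn };

// Mirrors layout_or_default(): an unset optional falls back to the strided default.
Layout layout_or_default(std::optional<Layout> layout) {
  return layout.has_value() ? *layout : Layout::Strided;
}

int main() {
  std::optional<Layout> unset;                    // caller did not specify a layout
  std::optional<Layout> sparse = Layout::Sparse;  // caller asked for sparse
  std::cout << (layout_or_default(unset) == Layout::Strided) << "\n";  // prints 1
  std::cout << (layout_or_default(sparse) == Layout::Sparse) << "\n";  // prints 1
  return 0;
}
```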
@@ -370,65 +393,7 @@ struct C10_API TensorOptions { } inline DispatchKey computeDispatchKey() const { - switch (layout()) { - case Layout::Strided: - switch (device().type()) { - case DeviceType::CPU: { - auto dtype_tmp = typeMetaToScalarType(dtype()); - if (isQIntType(dtype_tmp)) { - return DispatchKey::QuantizedCPU; - } - return DispatchKey::CPU; - } - case DeviceType::CUDA: { - auto dtype_tmp = typeMetaToScalarType(dtype()); - if (isQIntType(dtype_tmp)) { - return DispatchKey::QuantizedCUDA; - } - return DispatchKey::CUDA; - } - case DeviceType::MKLDNN: - return DispatchKey::MKLDNN; - case DeviceType::OPENGL: - return DispatchKey::OpenGL; - case DeviceType::OPENCL: - return DispatchKey::OpenCL; - case DeviceType::IDEEP: - return DispatchKey::IDEEP; - case DeviceType::HIP: - return DispatchKey::HIP; - case DeviceType::FPGA: - return DispatchKey::FPGA; - case DeviceType::MSNPU: - return DispatchKey::MSNPU; - case DeviceType::XLA: - return DispatchKey::XLA; - case DeviceType::Vulkan: - return DispatchKey::Vulkan; - default: - AT_ERROR("Unsupported device type for dense layout: ", device().type()); - } - case Layout::Sparse: - switch (device().type()) { - case DeviceType::CPU: - return DispatchKey::SparseCPU; - case DeviceType::CUDA: - return DispatchKey::SparseCUDA; - case DeviceType::HIP: - return DispatchKey::SparseHIP; - default: - AT_ERROR("Unsupported device type for sparse layout: ", device().type()); - } - case Layout::Mkldnn: - switch (device().type()) { - case DeviceType::CPU: - return DispatchKey::MkldnnCPU; - default: - AT_ERROR("Unsupported device type for mkldnn layout: ", device().type()); - } - default: - AT_ERROR("Unsupported layout: ", layout()); - } + return c10::computeDispatchKey(optTypeMetaToScalarType(dtype_opt()), layout_opt(), device_opt()); } private: @@ -611,13 +576,68 @@ inline std::string toString(const TensorOptions options) { // This is intended to be a centralized location by which we can determine // what an appropriate DispatchKey for a tensor is. -// -// This takes a TensorOptions, rather than just a DeviceType and Layout, because -// we reserve the right to change dispatch based on *any* aspect of -// TensorOptions. 
WARNING: If you do this, you need to fix the calls -// to computeDispatchKey in caffe2/tensor.h -inline DispatchKey computeDispatchKey(TensorOptions options) { - return options.computeDispatchKey(); +inline DispatchKey computeDispatchKey(c10::optional dtype, c10::optional layout, c10::optional device) { + const auto layout_ = layout_or_default(layout); + const auto device_ = device_or_default(device); + switch (layout_) { + case Layout::Strided: { + const auto dtype_ = dtype_or_default(dtype); + switch (device_.type()) { + case DeviceType::CPU: { + if (isQIntType(dtype_)) { + return DispatchKey::QuantizedCPU; + } + return DispatchKey::CPU; + } + case DeviceType::CUDA: { + if (isQIntType(dtype_)) { + return DispatchKey::QuantizedCUDA; + } + return DispatchKey::CUDA; + } + case DeviceType::MKLDNN: + return DispatchKey::MKLDNN; + case DeviceType::OPENGL: + return DispatchKey::OpenGL; + case DeviceType::OPENCL: + return DispatchKey::OpenCL; + case DeviceType::IDEEP: + return DispatchKey::IDEEP; + case DeviceType::HIP: + return DispatchKey::HIP; + case DeviceType::FPGA: + return DispatchKey::FPGA; + case DeviceType::MSNPU: + return DispatchKey::MSNPU; + case DeviceType::XLA: + return DispatchKey::XLA; + case DeviceType::Vulkan: + return DispatchKey::Vulkan; + default: + AT_ERROR("Unsupported device type for dense layout: ", device_.type()); + } + } + case Layout::Sparse: + switch (device_.type()) { + case DeviceType::CPU: + return DispatchKey::SparseCPU; + case DeviceType::CUDA: + return DispatchKey::SparseCUDA; + case DeviceType::HIP: + return DispatchKey::SparseHIP; + default: + AT_ERROR("Unsupported device type for sparse layout: ", device_.type()); + } + case Layout::Mkldnn: + switch (device_.type()) { + case DeviceType::CPU: + return DispatchKey::MkldnnCPU; + default: + AT_ERROR("Unsupported device type for mkldnn layout: ", device_.type()); + } + default: + AT_ERROR("Unsupported layout: ", layout_); + } } // We deliberately ignore handling AutogradCPU/CUDA/XLA... keys to diff --git a/c10/core/impl/DeviceGuardImplInterface.h b/c10/core/impl/DeviceGuardImplInterface.h index 516aebba0747..f7f5b4f867a9 100644 --- a/c10/core/impl/DeviceGuardImplInterface.h +++ b/c10/core/impl/DeviceGuardImplInterface.h @@ -209,7 +209,15 @@ class C10_API DeviceGuardImplRegistrar { static ::c10::impl::DeviceGuardImplRegistrar C10_ANONYMOUS_VARIABLE(g_##DeviceType)(::c10::DeviceType::DevType, new DeviceGuardImpl()); inline const DeviceGuardImplInterface* getDeviceGuardImpl(DeviceType type) { - auto p = device_guard_impl_registry[static_cast(type)].load(); + // Two adjacent int16_t fields DeviceType and DeviceIndex has field access + // miscompiled on NVCC. To workaround this issue, we apply a mask to the + // DeviceType. First check if the DeviceType is 16-bit. + // FB employees can see + // https://fb.workplace.com/groups/llvm.gcc/permalink/4053565044692080/ + // for more details + static_assert(sizeof(DeviceType) == 2, "DeviceType is not 16-bit"); + auto p = device_guard_impl_registry[static_cast(type) & 0xFFFF].load(); + // This seems to be the first place where you make use of a device // when you pass devices to factory functions. Give a nicer error // message in this case. 
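With `computeDispatchKey` now exposed as a free function over optional dtype/layout/device (the `TensorOptions::computeDispatchKey()` method above simply forwards to it), callers can resolve a dispatch key without materializing a `TensorOptions`. A hedged usage sketch, assuming a libtorch build that contains this patch:

```cpp
#include <c10/core/TensorOptions.h>

int main() {
  using namespace c10;
  // A quantized dtype on a strided CPU tensor resolves to the quantized backend key.
  DispatchKey quantized = computeDispatchKey(
      ScalarType::QUInt8, Layout::Strided, Device(DeviceType::CPU));
  // Unspecified fields fall back to the defaults: strided layout, CPU device,
  // and the global default dtype.
  DispatchKey dense = computeDispatchKey(nullopt, nullopt, nullopt);
  return (quantized == DispatchKey::QuantizedCPU && dense == DispatchKey::CPU) ? 0 : 1;
}
```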
diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index 2285a332f709..84542f064c2e 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -896,6 +896,19 @@ class THCCachingAllocator { THCCachingAllocator caching_allocator; +// Returns whether to force all allocations to bypass the caching allocator and +// go straight to cudaMalloc. This setting is useful when debugging GPU memory +// errors, since the caching allocator foils cuda-memcheck. +bool forceUncachedAllocator() { + static bool force_uncached = + getenv("PYTORCH_NO_CUDA_MEMORY_CACHING") != nullptr; + return force_uncached; +} + +static void uncached_delete(void* ptr) { + C10_CUDA_CHECK(cudaFree(ptr)); +} + // NB: I decided not to fold this into THCCachingAllocator, because the latter // has a lot more methods and it wasn't altogether clear that they should // actually be publicly exposed @@ -904,6 +917,10 @@ struct CudaCachingAllocator : public Allocator { int device; C10_CUDA_CHECK(cudaGetDevice(&device)); void* r = nullptr; + if (forceUncachedAllocator()) { + C10_CUDA_CHECK(cudaMalloc(&r, size)); + return {r, r, &uncached_delete, Device(DeviceType::CUDA, device)}; + } if (size != 0) { caching_allocator.malloc(&r, device, size, cuda::getCurrentCUDAStream(device)); } diff --git a/c10/macros/Export.h b/c10/macros/Export.h index 5888207c5f80..966dd22e08fa 100644 --- a/c10/macros/Export.h +++ b/c10/macros/Export.h @@ -113,8 +113,8 @@ #define TORCH_HIP_API C10_IMPORT #endif -// Enums only need to be exported on windows -#ifdef _WIN32 +// Enums only need to be exported on windows for non-CUDA files +#if defined(_WIN32) && defined(__CUDACC__) #define C10_API_ENUM C10_API #else #define C10_API_ENUM diff --git a/c10/macros/cmake_macros.h.in b/c10/macros/cmake_macros.h.in index 5e42506f20dc..2845fa1cd8d2 100644 --- a/c10/macros/cmake_macros.h.in +++ b/c10/macros/cmake_macros.h.in @@ -8,6 +8,7 @@ #cmakedefine C10_USE_GLOG #cmakedefine C10_USE_GFLAGS #cmakedefine C10_USE_NUMA +#cmakedefine C10_USE_MSVC_STATIC_RUNTIME // Used by libtorch mobile build to enable features that are not enabled by // caffe2 mobile build. Should only use it when necessary as we are committed diff --git a/c10/core/CPUCachingAllocator.cpp b/c10/mobile/CPUCachingAllocator.cpp similarity index 98% rename from c10/core/CPUCachingAllocator.cpp rename to c10/mobile/CPUCachingAllocator.cpp index 232b8f2306e2..b2f193299089 100644 --- a/c10/core/CPUCachingAllocator.cpp +++ b/c10/mobile/CPUCachingAllocator.cpp @@ -1,4 +1,4 @@ -#include +#include namespace c10 { diff --git a/c10/core/CPUCachingAllocator.h b/c10/mobile/CPUCachingAllocator.h similarity index 71% rename from c10/core/CPUCachingAllocator.h rename to c10/mobile/CPUCachingAllocator.h index ac5f3a95c881..6a748f4f1791 100644 --- a/c10/core/CPUCachingAllocator.h +++ b/c10/mobile/CPUCachingAllocator.h @@ -10,6 +10,38 @@ #include #include +/* + * CPUCachingAllocator: + * DISCLAIMER: + * This is subject to change (beta) and only supported on mobile builds. + * If code snippet such as in 'Usage pattern' is used outside of mobile + * build you will not observe the intended behavior. + * See below for more information. + * Why? + * It has been observed that some mobile platforms, such as pixel 3, return + * memory aggressively to the system. This results in page faults in some cases + * and ends up hurting performance. This caching allocator aims to address that. 
+ * Furthermore it also allows users to specify their own allocator by implementing + * allocate/free virtual interfaces. + * What are the cons? + * There are some cons that were observed where use of caching allocator led to + * worse performance on some platforms. The reason is that the caching mechanism + * used by this allocator left us worse off compared to the corresponding platform's + * tuned memory allocator. In that case it seemed better to not use this allocator. + * Note there are some ideas to fix this in the works. + * + * Usage: + * Usage pattern: + * Instantiate and own the caching allocator. + * std::unique_ptr caching_allocator = + * std::make_unique(); + * Use caching allocator with a scoped guard at inference time. + * { + * WithCPUCachingAllocatorGuard(caching_allocator.get()); + * ... model.forward(...); + * } + */ + namespace c10 { class C10_API CPUCachingAllocator { @@ -64,16 +96,6 @@ CPUCachingAllocator* GetDefaultCPUCachingAllocator(); bool ThreadLocalCachingAllocatorEnabled(); CPUCachingAllocator* GetThreadLocalCachingAllocator(); -/* - * Usage pattern: - * std::unique_ptr caching_allocator = - * std::make_unique(); - * { - * WithCPUCachingAllocatorGuard(caching_allocator.get()); - * ... - * } - */ - class C10_API WithCPUCachingAllocatorGuard { public: WithCPUCachingAllocatorGuard(CPUCachingAllocator* allocator); diff --git a/c10/mobile/CPUProfilingAllocator.cpp b/c10/mobile/CPUProfilingAllocator.cpp new file mode 100644 index 000000000000..3559c8ce280f --- /dev/null +++ b/c10/mobile/CPUProfilingAllocator.cpp @@ -0,0 +1,410 @@ +#include + +#include + +namespace c10 { + +namespace { +thread_local AllocationPlanner* allocation_planner{nullptr}; +thread_local CPUProfilingAllocator* profiling_allocator{nullptr}; + +struct MemBlock { + uint64_t start_offset, end_offset; + MemBlock(uint64_t s, uint64_t e) : start_offset(s), end_offset(e) {} + bool operator<(const MemBlock& other) const { + return end_offset <= other.start_offset; + } +}; + +bool validate_allocation_plan( + const std::vector& allocation_sizes, + const std::vector& allocation_offsets) { + std::set allocations; + for (uint64_t i = 0; i < allocation_sizes.size(); ++i) { + // Skip allocations not managed by AllocationPlan + if (allocation_offsets[i] == std::numeric_limits::max()) { + continue; + } + auto start_offset = allocation_offsets[i]; + auto end_offset = allocation_offsets[i] + allocation_sizes[i]; + if (!allocations.emplace(start_offset, end_offset).second) { + return false; + } + } + return true; +} + +enum class EventType { + Allocate = 0, + Free, + Invalid +}; + +struct MemEvent { + uint64_t time; + uint64_t allocation_id; + uint64_t size; + EventType type{EventType::Invalid}; + MemEvent(uint64_t t, uint64_t id, uint64_t s, EventType e) : + time(t), allocation_id(id), size(s), type(e) {} +}; + +std::vector create_and_sort_mem_events( + const std::vector& allocation_sizes, + const std::vector& allocation_lifetimes) { + std::vector events; + for (uint64_t i = 0; i < allocation_sizes.size(); ++i) { + // If observed allocations are freed outside the scope of + // observation, then allocations are not managed by the + // AllocationPlan.
+ if (allocation_lifetimes[i] == std::numeric_limits::max()) { + continue; + } + events.emplace_back(i, i, allocation_sizes[i], EventType::Allocate); + events.emplace_back(allocation_lifetimes[i], i, allocation_sizes[i], EventType::Free); + } + std::sort( + events.begin(), + events.end(), + [](const MemEvent& a, + const MemEvent& b) -> bool {return a.time < b.time;}); + return events; +} + +std::vector formulate_greedy_allocation_plan( + const std::vector& allocation_sizes, + const std::vector& allocation_lifetimes) { + // Step 1. Construct all allocation/free events. + // Sort these events by timestamp. + // Step 2. Iterate through all events. + // 2.1 If allocate event: + // Find all candidate in free_size_to_offset map + // Greedily pick the first one. + // Remove the entry from free_size_to_offset map. + // new_offset = offset + request_size + // new_size = size - request_size + // Add new entry to both maps + // 2.2 If free event. + // Check if the returned offset merges with another chunk. + // If so merge until no more merging is possible. + // If returned offset does not merge, then + // just return it as a chunk. + + // lower_bound on this map will get all candidates of + // the right size for allocation. + std::map free_size_to_offset; + // This provides fast lookup when we want to insert freed block + // back, especially when we want to merge blocks. + ska::flat_hash_map::iterator> free_start_offset_to_size_iter; + ska::flat_hash_map::iterator> free_end_offset_to_size_iter; + // Upon free end_ptr = offset + size + // If end_ptr exists merge freed allocation + // Also find coresponding offset in size_to_offet + // Remove that entry and update with new size and offset + // If end_ptr does not exist then just insert offset,size + // in map and correspondingly size, offset in the other map. + // Merging should always be done recursively until no more chunks + // that can be found. + // After last free we should have only one entry left in these maps. + ska::flat_hash_map allocated_offset_to_size; + + std::vector allocation_offsets( + allocation_sizes.size(), std::numeric_limits::max()); + auto mem_events = create_and_sort_mem_events(allocation_sizes, allocation_lifetimes); + uint64_t max_offset{0}; + for (const auto& mem_event : mem_events) { + uint64_t alloc_offset; + uint64_t new_offset, new_size; + if (mem_event.type == EventType::Allocate) { + auto it = free_size_to_offset.lower_bound(mem_event.size); + if (it == free_size_to_offset.end()) { + // If there is no contiguous block of the size requested + // allocate a new one. + alloc_offset = max_offset; + max_offset += mem_event.size; + allocated_offset_to_size.emplace(alloc_offset, mem_event.size); + } else { + // If we have found a block of the size we want + // 1. change the block by allocating out of it. + // 1.1 Erase the entire block + // 1.2 Erase the reverse map entries + // 2. If block still has space left insert the remainder back in map. + // Including reverse map entries. + // 3. Insert the allocated block in allocated_offset_to_size. 
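The step-by-step comments above describe first-fit reuse with block splitting and merging. As a much-simplified illustration of the core idea only (no splitting or merging; all names and the example trace are invented for this sketch and are not part of the patch):

```cpp
// Toy first-fit plan over a 3-allocation trace. A lifetime of k means the
// block becomes free right before allocation id k is made.
#include <cstdint>
#include <iostream>
#include <map>
#include <vector>

int main() {
  std::vector<uint64_t> sizes = {64, 32, 64};   // bytes, in allocation order
  std::vector<uint64_t> lifetimes = {2, 3, 3};  // id 0 dies before id 2
  std::vector<uint64_t> offsets(sizes.size(), 0);
  std::multimap<uint64_t, uint64_t> free_blocks;  // size -> offset
  uint64_t high_water = 0;
  for (uint64_t id = 0; id < sizes.size(); ++id) {
    // Return blocks whose lifetime ended at this point in the trace.
    for (uint64_t j = 0; j < id; ++j) {
      if (lifetimes[j] == id) free_blocks.emplace(sizes[j], offsets[j]);
    }
    // First fit: smallest free block that is large enough, else grow the blob.
    auto it = free_blocks.lower_bound(sizes[id]);
    if (it != free_blocks.end()) {
      offsets[id] = it->second;  // reuse the block (remainder handling omitted)
      free_blocks.erase(it);
    } else {
      offsets[id] = high_water;
      high_water += sizes[id];
    }
  }
  for (uint64_t id = 0; id < sizes.size(); ++id) {
    std::cout << "alloc " << id << " -> offset " << offsets[id] << "\n";
  }
  // Prints offsets 0, 64, 0: allocation 2 reuses allocation 0's slot, so the
  // blob stays at 96 bytes instead of 160.
  return 0;
}
```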
+ alloc_offset = it->second; + new_offset = alloc_offset + mem_event.size; + new_size = it->first - mem_event.size; + free_size_to_offset.erase(it); + free_start_offset_to_size_iter.erase(alloc_offset); + free_end_offset_to_size_iter.erase(alloc_offset + it->first); + if (new_size > 0) { + auto ref_it = free_size_to_offset.emplace(new_offset, new_size).first; + free_start_offset_to_size_iter.emplace(new_offset, ref_it); + free_end_offset_to_size_iter.emplace(new_offset + new_size, ref_it); + } + allocated_offset_to_size.emplace(alloc_offset, mem_event.size); + } + allocation_offsets[mem_event.allocation_id] = alloc_offset; + } else { + // 1. Check if freed block is adjacent to an existing free block + // at its end boundary. This is done by checking + // free_end_offset_to_size_iter. + // If we find such a block, remove it and adjust size of + // the block being freed. + // 2. Similarly check if freed block is adjacent to an existing + // free block at start boundary. This is done by checking + // free_start_offset_to_size_iter. + // If we find such a block, remove it and adjust size of + // the block being freed. + // 3. Insert the freed block in the map. + auto freed_offset = allocation_offsets[mem_event.allocation_id]; + auto freed_size = mem_event.size; + auto end_offset = freed_offset + freed_size; + // Merge when another free block exists at the end of this block + auto end_it = free_end_offset_to_size_iter.find(end_offset); + if (end_it != free_end_offset_to_size_iter.end()) { + auto size_to_end_offset_iter = end_it->second; + freed_size += size_to_end_offset_iter->first; + free_size_to_offset.erase(size_to_end_offset_iter); + free_end_offset_to_size_iter.erase(end_it); + } + // Merge when the freed block exists at the end of another free block + auto start_it = free_start_offset_to_size_iter.find(freed_offset); + if (start_it != free_start_offset_to_size_iter.end()) { + auto size_to_start_offset_iter = start_it->second; + freed_size += size_to_start_offset_iter->first; + freed_offset -= size_to_start_offset_iter->first; + free_size_to_offset.erase(size_to_start_offset_iter); + free_start_offset_to_size_iter.erase(start_it); + } + allocated_offset_to_size.erase(freed_offset); + auto freed_block_it = + free_size_to_offset.emplace(freed_size, freed_offset).first; + free_start_offset_to_size_iter.emplace(freed_offset, freed_block_it); + free_end_offset_to_size_iter.emplace( + freed_offset + freed_size, freed_block_it); + } + } + TORCH_CHECK(validate_allocation_plan(allocation_sizes, allocation_offsets), + "Allocation plan invalid."); + return allocation_offsets; +} + +} // namespace + +void AllocationPlan::clear() { + allocation_sizes.clear(); + allocation_lifetimes.clear(); + allocation_offsets.clear(); +} + +void AllocationPlanner::record_allocation( + const uint64_t size, const void* ptr) { + if (validation_mode_) { + validation_success = validation_success && validate_allocation(size, ptr); + return; + } + allocation_plan_->allocation_sizes.push_back(size); + allocation_plan_->allocation_lifetimes.push_back( + std::numeric_limits::max()); + allocation_ptr_to_id_.emplace(ptr, allocation_id_); + allocation_id_++; +} + +void AllocationPlanner::record_free(const void* ptr) { + if (validation_mode_) { + validation_success = validation_success && validate_free(ptr); + return; + } + auto it = allocation_ptr_to_id_.find(ptr); + if (it == allocation_ptr_to_id_.end()) { + // Free being recorded was allocated outside of WithProfileAllocationGuard + return; + } + auto id = it->second; + TORCH_CHECK(id <
allocation_plan_->allocation_lifetimes.size(), + "Allocation must have been recorded during record_allocation."); + allocation_plan_->allocation_lifetimes[id] = allocation_id_; +} + +bool AllocationPlanner::validate_allocation( + const uint64_t size, const void* ptr) { + if (allocation_id_ >= allocation_plan_->allocation_sizes.size() || + allocation_plan_->allocation_sizes[allocation_id_] != size) { + TORCH_WARN( + "Allocation request does not match plan:", + "Allocation id:", + allocation_id_, + ", Number of recorded allocations:", + allocation_plan_->allocation_sizes.size(), + ", Recorded size of the requested allocation:", + allocation_plan_->allocation_sizes[allocation_id_], + ", but got:", + size); + + return false; + } + allocation_ptr_to_id_.emplace(ptr, allocation_id_); + allocation_id_++; + return true; +} + +bool AllocationPlanner::validate_free(const void* ptr) { + auto it = allocation_ptr_to_id_.find(ptr); + if (it == allocation_ptr_to_id_.end()) { + // Allocation that was made outside the validation scope is being freed here + return true; + } + auto id = (*it).second; + TORCH_CHECK(id < allocation_plan_->allocation_lifetimes.size(), + "Allocation must have been recorded during validate_allocation."); + auto lifetime_id = allocation_plan_->allocation_lifetimes[id]; + return (lifetime_id == allocation_id_); +} + +void AllocationPlanner::formulate_plan() { + allocation_plan_->allocation_offsets = + formulate_greedy_allocation_plan( + allocation_plan_->allocation_sizes, allocation_plan_->allocation_lifetimes); + allocation_plan_->total_size = 0; + for (auto i = 0; i < allocation_plan_->allocation_sizes.size(); ++i) { + if (allocation_plan_->allocation_lifetimes[i] == + std::numeric_limits::max()) { + continue; + } + auto limit = allocation_plan_->allocation_offsets[i] + allocation_plan_->allocation_sizes[i]; + allocation_plan_->total_size = std::max(allocation_plan_->total_size, limit); + } +} + +void AllocationPlanner::clear() { + allocation_plan_->clear(); + allocation_ptr_to_id_.clear(); +} + +void CPUProfilingAllocator::set_plan(const AllocationPlan* plan) { + TORCH_CHECK(plan != nullptr, "Allocation plan is nullptr."); + plan_ = plan; + allocation_id_ = 0; + allocation_ptr_to_id_.clear(); + if (current_size_ < plan->total_size) { + // Free existing memory and reallocate for larger size. + c10::free_cpu(blob_); + blob_ = c10::alloc_cpu(plan->total_size); + current_size_ = plan->total_size; + } +} + +void CPUProfilingAllocator::unset_plan() { + allocation_id_ = 0; + allocation_ptr_to_id_.clear(); + plan_ = nullptr; +} + +void* CPUProfilingAllocator::allocate(const size_t bytes) { + TORCH_CHECK(bytes == plan_->allocation_sizes[allocation_id_], + "Got allocation request that does not match with the plan."); + if (plan_->allocation_lifetimes[allocation_id_] == + std::numeric_limits::max()) { + // This allocation is not managed by ProfilingAllocator. + allocation_id_++; + return c10::alloc_cpu(bytes); + } + void* ptr = + reinterpret_cast(blob_) + + plan_->allocation_offsets[allocation_id_]; + TORCH_CHECK(allocation_ptr_to_id_.emplace(ptr, allocation_id_).second); + allocation_id_++; + return ptr; +} + +void CPUProfilingAllocator::free(void* const ptr) { + auto it = allocation_ptr_to_id_.find(ptr); + if (it == allocation_ptr_to_id_.end()) { + // Either + // 1. Allocation that was made outside the validation scope is being freed here + // or + // 2. Allocation that is not managed by profiling allocator is being freed. + // Example of the second type + // Tensor out; + // for (....) 
{ + // { + // CPUProfilingAllocator + // out = ...some op (This also frees previous memory held by out) + // } + // out is used.. + // } + c10::free_cpu(ptr); + return; + } + auto id = it->second; + TORCH_CHECK(id < plan_->allocation_lifetimes.size(), + "Freeing allocation that is not according to the plan."); + auto lifetime_id = plan_->allocation_lifetimes[id]; + TORCH_CHECK( + lifetime_id == allocation_id_, + "Lifetime of allocation does not match: allocation_id ", + id, + ", expected:", + lifetime_id, + ", got:", + allocation_id_); +} + +CPUProfilingAllocator::~CPUProfilingAllocator() { + c10::free_cpu(blob_); +} + +WithProfileAllocationsGuard::WithProfileAllocationsGuard( + AllocationPlan* plan) { + // Nesting of allocation profiling does not seem meaningful. + TORCH_CHECK(allocation_planner == nullptr, + "Nesting profiling allocations is not supported."); + planner_ = std::make_unique(plan); + planner_->clear(); + allocation_planner = planner_.get(); +} + +WithProfileAllocationsGuard::~WithProfileAllocationsGuard() { + planner_->formulate_plan(); + allocation_planner = nullptr; +} + +WithValidateAllocationPlanGuard::WithValidateAllocationPlanGuard( + AllocationPlan* plan, bool* success) { + // Nesting of allocation profiling does not seem meaningful. + TORCH_CHECK(allocation_planner == nullptr, + "Nesting profiling allocations is not supported."); + planner_ = std::make_unique(plan, true); + success_ = success; + allocation_planner = planner_.get(); +} + +WithValidateAllocationPlanGuard::~WithValidateAllocationPlanGuard() { + *success_ = planner_->validation_success; + allocation_planner = nullptr; +} + +AllocationPlanner* GetThreadLocalAllocationPlanner() { + return allocation_planner; +} + +WithProfilingAllocatorGuard::WithProfilingAllocatorGuard( + CPUProfilingAllocator* allocator, const AllocationPlan* plan) { + // Nesting of profiling allocator is not supported. + TORCH_CHECK(profiling_allocator == nullptr, + "Nesting profiling allocators is not supported."); + profiling_allocator = allocator; + profiling_allocator->set_plan(plan); +} + +WithProfilingAllocatorGuard::~WithProfilingAllocatorGuard() { + profiling_allocator->unset_plan(); + profiling_allocator = nullptr; +} + +CPUProfilingAllocator* GetThreadLocalProfilingAllocator() { + return profiling_allocator; +} + +} // namespace c10 diff --git a/c10/mobile/CPUProfilingAllocator.h b/c10/mobile/CPUProfilingAllocator.h new file mode 100644 index 000000000000..4a7e79fe2857 --- /dev/null +++ b/c10/mobile/CPUProfilingAllocator.h @@ -0,0 +1,149 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace c10 { + +/* + * Given a sequence of allocations in a thread, AllocationPlan records + * 1. size of each allocation + * 2. Lifetime of each allocation. + * 3. allocation offsets: Memory offset for each allocation in a single blob of memory + * 4. Total size of a blob of memory required to satisfy all the allocations. + */ +class C10_API AllocationPlan { + private: + // Records size of each allocation by their sequential allocation ids. + std::vector allocation_sizes; + // This maps one allocation id (X) to another allocation id (Y). + // Allocation X is alive until allocation Y. From allocation Y onwards + // allocation X is not referenced. + // Thus Y is the id of the first allocation after X is freed.
+ // NB: When an allocation is recorded, along with recording its size, + // we also set the lifetime to be numeric_limits::max() + // This is to track allocations that are made during the scope of + // profiling but were not freed until after the scope ended. + // Such allocations are not managed by profiling allocator. + std::vector allocation_lifetimes; + // Maps an allocation to some offset in a blob of memory. + std::vector allocation_offsets; + uint64_t total_size{0}; + void clear(); + friend class AllocationPlanner; + friend class CPUProfilingAllocator; +}; + +/* + * Map of memory ptr to allocation id. This is auxiliary information only + * used to establish lifetime of allocations. + */ +class C10_API AllocationPlanner { + private: + AllocationPlan* allocation_plan_{nullptr}; + // Maps allocated ptr to its allocation id. + // This is used when freeing the memory to lookup the allocation id + // in order to establish the lifetime of a particular allocation. + ska::flat_hash_map allocation_ptr_to_id_; + uint64_t allocation_id_{0}; + bool validation_mode_{false}; + + bool validate_allocation(const uint64_t size, const void* ptr); + bool validate_free(const void* ptr); + public: + bool validation_success{true}; + + AllocationPlanner() = delete; + AllocationPlanner(AllocationPlan* plan, bool validate = false) : + allocation_plan_(plan), validation_mode_(validate) {} + void record_allocation(const uint64_t size, const void* ptr); + void record_free(const void* ptr); + void formulate_plan(); + void clear(); +}; + +// NOT THREAD SAFE profiling allocator. +class C10_API CPUProfilingAllocator { + private: + const AllocationPlan* plan_{nullptr}; + uint64_t allocation_id_{0}; + uint64_t current_size_{0}; + void* blob_{nullptr}; + ska::flat_hash_map allocation_ptr_to_id_; + public: + ~CPUProfilingAllocator(); + void set_plan(const AllocationPlan* plan); + void unset_plan(); + void* allocate(const size_t bytes); + void free(void* const ptr); +}; + +/* + * Usage: Profile allocations made by one run of the model. + * AllocationPlan plan; + * { + * WithProfileAllocationGuard profile_guard(&plan); + * module.forward(...); + * } + * plan now contains allocation plan. + */ +class C10_API WithProfileAllocationsGuard { + public: + WithProfileAllocationsGuard(AllocationPlan* plan); + ~WithProfileAllocationsGuard(); + private: + std::unique_ptr planner_; +}; + +/* + * Usage: Validate allocation plan made with WithProfileAllocationGuard + * bool plan_validation_success, success = true; + * for (some number of representative inputs) + * { + * WithValidateAllocationPlanGuard(&plan, &plan_validation_success); + * module.forward(...); + * success = success && plan_validation_success; + * } + * success == true means allocations are according to plan + * else for some inputs allocation pattern changed. + */ +class C10_API WithValidateAllocationPlanGuard { + public: + WithValidateAllocationPlanGuard(AllocationPlan* plan, bool* success); + ~WithValidateAllocationPlanGuard(); + private: + std::unique_ptr planner_; + bool* success_; +}; + +AllocationPlanner* GetThreadLocalAllocationPlanner(); + +/* + * Usage: Allocate tensors accordingly to allocation plan + * First make allocation plan. + * See WithProfileAllocationsGuard usage. + * Second validate allocation plan. + * See WithValidateAllocationPlanGuard usage. 
+ * CPUProfilingAllocator profiling_allocator; + * { + * WithProfilingAllocatorGuard allocator_guard(&profiling_allocator, &plan); + * module.forward(...); + * } + */ +class C10_API WithProfilingAllocatorGuard { + public: + WithProfilingAllocatorGuard( + CPUProfilingAllocator* allocator, const AllocationPlan* plan); + ~WithProfilingAllocatorGuard(); +}; + +CPUProfilingAllocator* GetThreadLocalProfilingAllocator(); + +} // namespace c10 diff --git a/c10/util/BFloat16-inl.h b/c10/util/BFloat16-inl.h index da6ce3859552..57e2a69b86fb 100644 --- a/c10/util/BFloat16-inl.h +++ b/c10/util/BFloat16-inl.h @@ -7,15 +7,44 @@ namespace c10 { /// Constructors inline C10_HOST_DEVICE BFloat16::BFloat16(float value) { +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + x = __bfloat16_as_ushort(__float2bfloat16(value)); +#else // RNE by default x = detail::round_to_nearest_even(value); +#endif } /// Implicit conversions inline C10_HOST_DEVICE BFloat16::operator float() const { +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 + return __bfloat162float(*reinterpret_cast(&x)); +#else return detail::f32_from_bits(x); +#endif } +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 +inline C10_HOST_DEVICE BFloat16::BFloat16(const __nv_bfloat16& value) { + x = *reinterpret_cast(&value); +} +inline C10_HOST_DEVICE BFloat16::operator __nv_bfloat16() const { + return *reinterpret_cast(&x); +} +#endif + +// CUDA intrinsics + +#if defined(__CUDACC__) || defined(__HIPCC__) +inline C10_DEVICE BFloat16 __ldg(const BFloat16* ptr) { +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + return __ldg(reinterpret_cast(ptr)); +#else + return *ptr; +#endif +} +#endif + /// Arithmetic inline C10_HOST_DEVICE BFloat16 operator+(const BFloat16& a, const BFloat16& b) { diff --git a/c10/util/BFloat16.h b/c10/util/BFloat16.h index 375b1086e073..0bd115d568f6 100644 --- a/c10/util/BFloat16.h +++ b/c10/util/BFloat16.h @@ -7,6 +7,10 @@ #include #include +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 +#include +#endif + namespace c10 { namespace detail { @@ -84,6 +88,11 @@ struct alignas(2) BFloat16 { constexpr C10_HOST_DEVICE BFloat16(unsigned short bits, from_bits_t) : x(bits){}; inline C10_HOST_DEVICE BFloat16(float value); inline C10_HOST_DEVICE operator float() const; + +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 + inline C10_HOST_DEVICE BFloat16(const __nv_bfloat16& value); + explicit inline C10_HOST_DEVICE operator __nv_bfloat16() const; +#endif }; } // namespace c10 diff --git a/c10/util/ThreadLocalDebugInfo.cpp b/c10/util/ThreadLocalDebugInfo.cpp index a9cdc26b5934..20d473667a8d 100644 --- a/c10/util/ThreadLocalDebugInfo.cpp +++ b/c10/util/ThreadLocalDebugInfo.cpp @@ -51,6 +51,15 @@ std::shared_ptr ThreadLocalDebugInfo::_pop(DebugInfoKind kind) { return res->info_; } +/* static */ +std::shared_ptr ThreadLocalDebugInfo::_peek(DebugInfoKind kind) { + TORCH_CHECK( + debug_info && debug_info->kind_ == kind, + "Expected debug info of type ", + (size_t)kind); + return debug_info->info_; +} + DebugInfoGuard::DebugInfoGuard( DebugInfoKind kind, std::shared_ptr info) { diff --git a/c10/util/ThreadLocalDebugInfo.h b/c10/util/ThreadLocalDebugInfo.h index 207abed781b0..9620cfb9fdea 100644 --- a/c10/util/ThreadLocalDebugInfo.h +++ b/c10/util/ThreadLocalDebugInfo.h @@ -46,6 +46,9 @@ class C10_API ThreadLocalDebugInfo { // Pop debug info, throws in case the last pushed // debug info is not of a given kind static std::shared_ptr 
_pop(DebugInfoKind kind); + // Peek debug info, throws in case the last pushed debug info is not of the + // given kind + static std::shared_ptr _peek(DebugInfoKind kind); private: std::shared_ptr info_; diff --git a/c10/util/complex.h b/c10/util/complex.h index 53ec4f30e539..9c63a2b296fb 100644 --- a/c10/util/complex.h +++ b/c10/util/complex.h @@ -257,6 +257,11 @@ struct alignas(sizeof(T) * 2) complex { } #endif + // consistent with NumPy behavior + explicit constexpr operator bool() const { + return real() || imag(); + } + constexpr T real() const { return real_; } diff --git a/c10/util/quint4x2.h b/c10/util/quint4x2.h new file mode 100644 index 000000000000..c2502b561409 --- /dev/null +++ b/c10/util/quint4x2.h @@ -0,0 +1,18 @@ +#pragma once +#include + +#include + +namespace c10 { + +/** + * quint4x2 is for un-signed 4 bit quantized Tensors that are packed to byte boundary. + */ +struct alignas(1) quint4x2 { + using underlying = uint8_t; + uint8_t val_; + quint4x2() = default; + C10_HOST_DEVICE explicit quint4x2(uint8_t val) : val_(val) {} +}; + +} // namespace c10 diff --git a/c10/util/typeid.cpp b/c10/util/typeid.cpp index e97eaa843979..e2070a1584a2 100644 --- a/c10/util/typeid.cpp +++ b/c10/util/typeid.cpp @@ -88,5 +88,6 @@ CAFFE_KNOWN_TYPE(c10::qint8) CAFFE_KNOWN_TYPE(c10::quint8) CAFFE_KNOWN_TYPE(c10::qint32) CAFFE_KNOWN_TYPE(at::BFloat16) +CAFFE_KNOWN_TYPE(c10::quint4x2) } // namespace caffe2 diff --git a/c10/util/typeid.h b/c10/util/typeid.h index 62a0bdfc6644..51833fb545ad 100644 --- a/c10/util/typeid.h +++ b/c10/util/typeid.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 65f072b6f29d..318e46a44f54 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -6,7 +6,7 @@ if(USE_VULKAN) include(../cmake/VulkanCodegen.cmake) endif() -# ---[ MSVC OpenMP modification +# ---[ MSVC OpenMP modification if(MSVC) include(../cmake/public/utils.cmake) endif() @@ -111,7 +111,7 @@ endif() add_subdirectory(core) add_subdirectory(serialize) add_subdirectory(utils) -if(BUILD_CAFFE2) +if(BUILD_CAFFE2 OR (NOT USE_FBGEMM)) add_subdirectory(perfkernels) endif() @@ -291,26 +291,29 @@ endif() if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) if(USE_DISTRIBUTED) - add_library(process_group_agent "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.cpp" "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.h") - target_link_libraries(process_group_agent PRIVATE torch c10d fmt::fmt-header-only) - add_dependencies(process_group_agent torch c10d) # Define this target even if we're building without TensorPipe, to make life # easier to other targets that depend on this. However, in that case, by not # setting the USE_TENSORPIPE compile definition, this target will just end # up being empty. Downstream targets should also add a #ifdef guard. 
- add_library(tensorpipe_agent - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.cpp" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.h" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.cpp" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.h" - ) - target_link_libraries(tensorpipe_agent PRIVATE torch c10d tensorpipe fmt::fmt-header-only) - add_dependencies(tensorpipe_agent torch c10d) - if(USE_TENSORPIPE) - target_compile_definitions(tensorpipe_agent PUBLIC USE_TENSORPIPE) - target_link_libraries(tensorpipe_agent PRIVATE tensorpipe) - add_dependencies(tensorpipe_agent tensorpipe) + if(NOT WIN32) + add_library(process_group_agent "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.cpp" "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.h") + target_link_libraries(process_group_agent PRIVATE torch c10d fmt::fmt-header-only) + add_dependencies(process_group_agent torch c10d) + + add_library(tensorpipe_agent + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.cpp" + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.h" + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.cpp" + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.h" + ) + target_link_libraries(tensorpipe_agent PRIVATE torch c10d tensorpipe fmt::fmt-header-only) + add_dependencies(tensorpipe_agent torch c10d) + if(USE_TENSORPIPE) + target_compile_definitions(tensorpipe_agent PUBLIC USE_TENSORPIPE) + target_link_libraries(tensorpipe_agent PRIVATE tensorpipe) + add_dependencies(tensorpipe_agent tensorpipe) + endif() endif() endif() @@ -493,7 +496,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) PROPERTIES COMPILE_FLAGS "-DC10_DISABLE_LEGACY_IMPORT" ) endif() - if(USE_DISTRIBUTED) + if(USE_DISTRIBUTED AND NOT WIN32) append_filelist("libtorch_distributed_sources" TORCH_SRCS) endif() endif() @@ -506,6 +509,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) ${TORCH_SRC_DIR}/csrc/cuda/comm.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/arith.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/compute_at.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/codegen.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/dispatch.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/expr_evaluator.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/executor.cpp @@ -515,6 +519,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/fusion.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/graph_fuser.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/index_compute.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/instrumentation.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/ir_base_nodes.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/ir_cloner.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/ir_graphviz.cpp @@ -524,7 +529,9 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/kernel.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/kernel_cache.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/kernel_ir.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/kernel_ir_builder.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_index.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_insert_syncs.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_loops.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_thread_predicate.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_unroll.cpp @@ -834,10 +841,10 @@ endif() DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch) - if(BUILD_TEST AND NOT USE_ROCM) + if(BUILD_TEST) add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit) 
add_subdirectory(${TORCH_ROOT}/test/cpp/tensorexpr ${CMAKE_BINARY_DIR}/test_tensorexpr) - if(USE_DISTRIBUTED) + if(USE_DISTRIBUTED AND NOT WIN32) add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc) endif() endif() @@ -889,9 +896,7 @@ endif() DESTINATION share/cmake/Torch) if(USE_DISTRIBUTED) - if(NOT MSVC) - add_subdirectory(${TORCH_SRC_DIR}/lib/c10d lib_c10d) - endif() + add_subdirectory(${TORCH_SRC_DIR}/lib/c10d lib_c10d) endif() @@ -966,6 +971,14 @@ if(USE_DISTRIBUTED) target_compile_definitions(torch_cpu PRIVATE USE_DISTRIBUTED ) + # Pass USE_RPC in order to reduce use of + # #if defined(USE_DISTRIBUTED) && !defined(_WIN32) + # need to be removed when RPC is supported + if(NOT WIN32) + target_compile_definitions(torch_cpu PRIVATE + USE_RPC + ) + endif() # Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp # can only be compiled with USE_TENSORPIPE is set. if(USE_TENSORPIPE) @@ -1243,7 +1256,9 @@ endif() if(BUILD_STATIC_RUNTIME_BENCHMARK) add_subdirectory(${TORCH_ROOT}/benchmarks/static_runtime ${PROJECT_BINARY_DIR}/bin) add_executable(static_runtime_bench "${STATIC_RUNTIME_BENCHMARK_SRCS}") + add_executable(static_runtime_test "${STATIC_RUNTIME_TEST_SRCS}") target_link_libraries(static_runtime_bench torch_library benchmark) + target_link_libraries(static_runtime_test torch_library gtest_main) endif() if(BUILD_MOBILE_BENCHMARK) @@ -1276,8 +1291,8 @@ if(BUILD_TEST) foreach(test_src ${ATen_VEC256_TEST_SRCS}) foreach(i RANGE ${NUM_CPU_CAPABILITY_NAMES}) get_filename_component(test_name ${test_src} NAME_WE) - list(GET CPU_CAPABILITY_NAMES ${i} CPU_CAPABILITY) - list(GET CPU_CAPABILITY_FLAGS ${i} FLAGS) + list(GET CPU_CAPABILITY_NAMES ${i} CPU_CAPABILITY) + list(GET CPU_CAPABILITY_FLAGS ${i} FLAGS) separate_arguments(FLAGS UNIX_COMMAND "${FLAGS}") add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}") target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library gtest_main) @@ -1287,7 +1302,7 @@ if(BUILD_TEST) target_compile_definitions(${test_name}_${CPU_CAPABILITY} PRIVATE CPU_CAPABILITY=${CPU_CAPABILITY} CPU_CAPABILITY_${CPU_CAPABILITY}) target_compile_options(${test_name}_${CPU_CAPABILITY} PRIVATE ${FLAGS}) if(NOT MSVC) - target_compile_options(${test_name}_${CPU_CAPABILITY} PRIVATE -Wno-ignored-qualifiers) + target_compile_options(${test_name}_${CPU_CAPABILITY} PRIVATE -Wno-ignored-qualifiers) endif(NOT MSVC) add_test(NAME ${test_name}_${CPU_CAPABILITY} COMMAND $) endforeach() diff --git a/caffe2/contrib/aten/aten_test.py b/caffe2/contrib/aten/aten_test.py index 92448fe355de..d9d99a1c1ae9 100644 --- a/caffe2/contrib/aten/aten_test.py +++ b/caffe2/contrib/aten/aten_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, dyndep from hypothesis import given diff --git a/caffe2/contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py index a8979ca63aa6..94a76fed85f5 100644 --- a/caffe2/contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/contrib/fakelowp/test/test_batchnorm_nnpi_fp16.py 
b/caffe2/contrib/fakelowp/test/test_batchnorm_nnpi_fp16.py index 1a4f57b6aa05..7b1b5f070171 100644 --- a/caffe2/contrib/fakelowp/test/test_batchnorm_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_batchnorm_nnpi_fp16.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/contrib/fakelowp/test/test_deq_swish_quant_nnpi.py b/caffe2/contrib/fakelowp/test/test_deq_swish_quant_nnpi.py index 511c29884288..b7a9fc810cfc 100644 --- a/caffe2/contrib/fakelowp/test/test_deq_swish_quant_nnpi.py +++ b/caffe2/contrib/fakelowp/test/test_deq_swish_quant_nnpi.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import numpy as np import caffe2.python.fakelowp.init_shared_libs # noqa diff --git a/caffe2/contrib/fakelowp/test/test_fc_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_fc_nnpi_fp16.py index bb013a26a609..7a68af63a84b 100644 --- a/caffe2/contrib/fakelowp/test/test_fc_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_fc_nnpi_fp16.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/contrib/fakelowp/test/test_fusions.py b/caffe2/contrib/fakelowp/test/test_fusions.py index 22e78b0756c0..45757badba43 100644 --- a/caffe2/contrib/fakelowp/test/test_fusions.py +++ b/caffe2/contrib/fakelowp/test/test_fusions.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + # Must happen before importing caffe2.python.* import caffe2.python.fakelowp.init_shared_libs # noqa diff --git a/caffe2/contrib/fakelowp/test/test_int8_ops_nnpi.py b/caffe2/contrib/fakelowp/test/test_int8_ops_nnpi.py index 4c82917f042c..5a91a00706ff 100644 --- a/caffe2/contrib/fakelowp/test/test_int8_ops_nnpi.py +++ b/caffe2/contrib/fakelowp/test/test_int8_ops_nnpi.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.fakelowp.init_shared_libs # noqa import numpy as np diff --git a/caffe2/contrib/fakelowp/test/test_int8_quant.py b/caffe2/contrib/fakelowp/test/test_int8_quant.py index 83d0cc176def..02095286e1ee 100644 --- a/caffe2/contrib/fakelowp/test/test_int8_quant.py +++ b/caffe2/contrib/fakelowp/test/test_int8_quant.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + # Must happen before importing caffe2.python.* import caffe2.python.fakelowp.init_shared_libs # noqa diff --git a/caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py index 698b839f3785..9ff0986116b6 100644 --- a/caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import caffe2.python.fakelowp.init_shared_libs # noqa diff --git a/caffe2/contrib/fakelowp/test/test_op_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_op_nnpi_fp16.py index 58161409fa80..e8512b4dcd74 100644 --- a/caffe2/contrib/fakelowp/test/test_op_nnpi_fp16.py +++ 
b/caffe2/contrib/fakelowp/test/test_op_nnpi_fp16.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/contrib/fakelowp/test/test_sls_4bit_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_sls_4bit_nnpi_fp16.py index 0ca76bd86ba9..a8d6640fa58e 100644 --- a/caffe2/contrib/fakelowp/test/test_sls_4bit_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_sls_4bit_nnpi_fp16.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp16.py index da7eae2708f3..f8fd03cbfb73 100644 --- a/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp16.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import unittest diff --git a/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp32.py b/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp32.py index ad26952a901c..207403f1bd0d 100644 --- a/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp32.py +++ b/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp32.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import unittest diff --git a/caffe2/contrib/gloo/gloo_test.py b/caffe2/contrib/gloo/gloo_test.py index 8eaff9e137ae..fbca9b8fe64c 100644 --- a/caffe2/contrib/gloo/gloo_test.py +++ b/caffe2/contrib/gloo/gloo_test.py @@ -1,9 +1,9 @@ #!/usr/bin/env python -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given, settings import hypothesis.strategies as st diff --git a/caffe2/contrib/nccl/nccl_ops_test.py b/caffe2/contrib/nccl/nccl_ops_test.py index 3f4685548281..2d4e9b518b9b 100644 --- a/caffe2/contrib/nccl/nccl_ops_test.py +++ b/caffe2/contrib/nccl/nccl_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/contrib/nnpack/nnpack_ops_test.py b/caffe2/contrib/nnpack/nnpack_ops_test.py index b12acd151a71..4bedf0e0ecd6 100644 --- a/caffe2/contrib/nnpack/nnpack_ops_test.py +++ b/caffe2/contrib/nnpack/nnpack_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/contrib/playground/AnyExp.py b/caffe2/contrib/playground/AnyExp.py index 5d968b0455fc..b8e2f8b37b2a 100644 --- a/caffe2/contrib/playground/AnyExp.py +++ b/caffe2/contrib/playground/AnyExp.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from abc import abstractmethod diff --git a/caffe2/contrib/playground/AnyExpOnTerm.py b/caffe2/contrib/playground/AnyExpOnTerm.py index b269777da675..dcfe61f14545 100644 --- a/caffe2/contrib/playground/AnyExpOnTerm.py +++ 
b/caffe2/contrib/playground/AnyExpOnTerm.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import json diff --git a/caffe2/contrib/playground/ModuleRegister.py b/caffe2/contrib/playground/ModuleRegister.py index 89a9deb8989e..27e0c07f6384 100644 --- a/caffe2/contrib/playground/ModuleRegister.py +++ b/caffe2/contrib/playground/ModuleRegister.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import inspect import logging diff --git a/caffe2/contrib/playground/checkpoint.py b/caffe2/contrib/playground/checkpoint.py index 9887a408cc01..5ea3d2a9035c 100644 --- a/caffe2/contrib/playground/checkpoint.py +++ b/caffe2/contrib/playground/checkpoint.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import pickle diff --git a/caffe2/contrib/playground/compute_loss.py b/caffe2/contrib/playground/compute_loss.py index 53eb77d77701..2965ff3895ac 100644 --- a/caffe2/contrib/playground/compute_loss.py +++ b/caffe2/contrib/playground/compute_loss.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import caffe2.contrib.playground.meter as Meter from caffe2.python import workspace diff --git a/caffe2/contrib/playground/compute_topk_accuracy.py b/caffe2/contrib/playground/compute_topk_accuracy.py index 396b797ed1b6..e2f148231c6d 100644 --- a/caffe2/contrib/playground/compute_topk_accuracy.py +++ b/caffe2/contrib/playground/compute_topk_accuracy.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import caffe2.contrib.playground.meter as Meter from caffe2.python import workspace diff --git a/caffe2/contrib/playground/meter.py b/caffe2/contrib/playground/meter.py index 7e109e445d04..ed0158bbf087 100644 --- a/caffe2/contrib/playground/meter.py +++ b/caffe2/contrib/playground/meter.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from abc import abstractmethod diff --git a/caffe2/contrib/playground/module_map.py b/caffe2/contrib/playground/module_map.py index 0f5de5943a36..8eb1a3a00cdc 100644 --- a/caffe2/contrib/playground/module_map.py +++ b/caffe2/contrib/playground/module_map.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + # Input import caffe2.contrib.playground.resnetdemo.\ diff --git a/caffe2/contrib/playground/output_generator.py b/caffe2/contrib/playground/output_generator.py index 41d8e3fdfae4..aaa977c08faa 100644 --- a/caffe2/contrib/playground/output_generator.py +++ b/caffe2/contrib/playground/output_generator.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import timeout_guard diff --git 
a/caffe2/contrib/playground/resnetdemo/IN1k_resnet.py b/caffe2/contrib/playground/resnetdemo/IN1k_resnet.py index 52ce95ed5dab..58085dbc3721 100644 --- a/caffe2/contrib/playground/resnetdemo/IN1k_resnet.py +++ b/caffe2/contrib/playground/resnetdemo/IN1k_resnet.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/contrib/playground/resnetdemo/IN1k_resnet_no_test_model.py b/caffe2/contrib/playground/resnetdemo/IN1k_resnet_no_test_model.py index cf893b598446..480070752e63 100644 --- a/caffe2/contrib/playground/resnetdemo/IN1k_resnet_no_test_model.py +++ b/caffe2/contrib/playground/resnetdemo/IN1k_resnet_no_test_model.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_forward.py b/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_forward.py index 174ffe1e034a..fa0fedd84a8c 100644 --- a/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_forward.py +++ b/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_forward.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import caffe2.python.models.resnet as resnet diff --git a/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_param_update.py b/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_param_update.py index 974653446a22..5697d1301b8a 100644 --- a/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_param_update.py +++ b/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_param_update.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def gen_param_update_builder_fun(self, model, dataset, is_train): diff --git a/caffe2/contrib/playground/resnetdemo/explicit_resnet_forward.py b/caffe2/contrib/playground/resnetdemo/explicit_resnet_forward.py index 01b51fa8450c..056ddd8c9ea0 100644 --- a/caffe2/contrib/playground/resnetdemo/explicit_resnet_forward.py +++ b/caffe2/contrib/playground/resnetdemo/explicit_resnet_forward.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import logging logging.basicConfig() diff --git a/caffe2/contrib/playground/resnetdemo/explicit_resnet_param_update.py b/caffe2/contrib/playground/resnetdemo/explicit_resnet_param_update.py index 8a86289778ee..5378acd61886 100644 --- a/caffe2/contrib/playground/resnetdemo/explicit_resnet_param_update.py +++ b/caffe2/contrib/playground/resnetdemo/explicit_resnet_param_update.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace, core from caffe2.proto import caffe2_pb2 diff --git a/caffe2/contrib/playground/resnetdemo/gfs_IN1k.py b/caffe2/contrib/playground/resnetdemo/gfs_IN1k.py index 8b2647114b63..496ac22ffde5 100644 --- a/caffe2/contrib/playground/resnetdemo/gfs_IN1k.py +++ 
b/caffe2/contrib/playground/resnetdemo/gfs_IN1k.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + # # example1 using gfs as input source. diff --git a/caffe2/contrib/playground/resnetdemo/override_no_test_model_no_checkpoint.py b/caffe2/contrib/playground/resnetdemo/override_no_test_model_no_checkpoint.py index 4cc2d68cbfd7..419d6a25e95b 100644 --- a/caffe2/contrib/playground/resnetdemo/override_no_test_model_no_checkpoint.py +++ b/caffe2/contrib/playground/resnetdemo/override_no_test_model_no_checkpoint.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def checkpoint(self, epoch): self.model_path = None diff --git a/caffe2/contrib/playground/resnetdemo/rendezvous_filestore.py b/caffe2/contrib/playground/resnetdemo/rendezvous_filestore.py index d757896793ff..0a56d68257ee 100644 --- a/caffe2/contrib/playground/resnetdemo/rendezvous_filestore.py +++ b/caffe2/contrib/playground/resnetdemo/rendezvous_filestore.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python import dyndep diff --git a/caffe2/contrib/prof/cuda_profile_ops_test.py b/caffe2/contrib/prof/cuda_profile_ops_test.py index 2953503bbea5..c77b7ae88ba6 100644 --- a/caffe2/contrib/prof/cuda_profile_ops_test.py +++ b/caffe2/contrib/prof/cuda_profile_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.proto import caffe2_pb2 diff --git a/caffe2/contrib/tensorboard/tensorboard.py b/caffe2/contrib/tensorboard/tensorboard.py index 9aece77bc09a..6f5ad1896e35 100644 --- a/caffe2/contrib/tensorboard/tensorboard.py +++ b/caffe2/contrib/tensorboard/tensorboard.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import click import collections diff --git a/caffe2/contrib/tensorboard/tensorboard_exporter.py b/caffe2/contrib/tensorboard/tensorboard_exporter.py index a3c0e0e59723..ef12ce563cde 100644 --- a/caffe2/contrib/tensorboard/tensorboard_exporter.py +++ b/caffe2/contrib/tensorboard/tensorboard_exporter.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from builtins import bytes import copy diff --git a/caffe2/contrib/tensorboard/tensorboard_exporter_test.py b/caffe2/contrib/tensorboard/tensorboard_exporter_test.py index 6b9c894e16fb..31ef8180fb57 100644 --- a/caffe2/contrib/tensorboard/tensorboard_exporter_test.py +++ b/caffe2/contrib/tensorboard/tensorboard_exporter_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest diff --git a/caffe2/contrib/tensorboard/tensorboard_test.py b/caffe2/contrib/tensorboard/tensorboard_test.py index 494cb6fc7d12..8751be14ead5 100644 --- a/caffe2/contrib/tensorboard/tensorboard_test.py +++ 
b/caffe2/contrib/tensorboard/tensorboard_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import click.testing import numpy as np diff --git a/caffe2/contrib/warpctc/ctc_ops_test.py b/caffe2/contrib/warpctc/ctc_ops_test.py index 3b21c8b66747..013e80a98773 100644 --- a/caffe2/contrib/warpctc/ctc_ops_test.py +++ b/caffe2/contrib/warpctc/ctc_ops_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from caffe2.proto import caffe2_pb2 diff --git a/caffe2/core/nomnigraph/op_gen.py b/caffe2/core/nomnigraph/op_gen.py index 49cd2abb2cef..fbe1c8da377e 100755 --- a/caffe2/core/nomnigraph/op_gen.py +++ b/caffe2/core/nomnigraph/op_gen.py @@ -1,9 +1,9 @@ #!/usr/bin/env python -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse from textwrap import dedent diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 18a7be64d670..27f8b471b71b 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -70,7 +70,7 @@ class CAFFE2_API Tensor final { explicit Tensor(at::Device device) : impl_(c10::make_intrusive( Storage::create_legacy(device), - c10::computeDispatchKey(at::device(device).layout(at::kStrided)), + c10::computeDispatchKey(c10::nullopt, at::kStrided, device), TypeMeta())) {} /** diff --git a/caffe2/distributed/file_store_handler.cc b/caffe2/distributed/file_store_handler.cc index 5a749c304d2b..5a34e53b6947 100644 --- a/caffe2/distributed/file_store_handler.cc +++ b/caffe2/distributed/file_store_handler.cc @@ -122,6 +122,16 @@ int64_t FileStoreHandler::add( return 0; } +int64_t FileStoreHandler::getNumKeys() { + CHECK(false) << "getNumKeys not implemented for FileStoreHandler"; + return 0; +} + +bool FileStoreHandler::deleteKey(const std::string& /* unused */) { + CHECK(false) << "deleteKey not implemented for FileStoreHandler"; + return false; +} + bool FileStoreHandler::check(const std::vector& names) { std::vector paths; for (const auto& name : names) { diff --git a/caffe2/distributed/file_store_handler.h b/caffe2/distributed/file_store_handler.h index b58b156e51b0..9ca81e4c2c7d 100644 --- a/caffe2/distributed/file_store_handler.h +++ b/caffe2/distributed/file_store_handler.h @@ -17,6 +17,10 @@ class CAFFE2_API FileStoreHandler : public StoreHandler { virtual int64_t add(const std::string& name, int64_t value) override; + virtual bool deleteKey(const std::string& key) override; + + virtual int64_t getNumKeys() override; + virtual bool check(const std::vector& names) override; virtual void wait( diff --git a/caffe2/distributed/file_store_handler_op_test.py b/caffe2/distributed/file_store_handler_op_test.py index 2e90c548d50f..427b68420d39 100644 --- a/caffe2/distributed/file_store_handler_op_test.py +++ b/caffe2/distributed/file_store_handler_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import errno import os diff --git a/caffe2/distributed/redis_store_handler.cc b/caffe2/distributed/redis_store_handler.cc index 7caaa6c79de7..e424c0e719fd 100644 --- a/caffe2/distributed/redis_store_handler.cc +++ b/caffe2/distributed/redis_store_handler.cc @@ -76,6 +76,16 @@ int64_t 
RedisStoreHandler::add(const std::string& name, int64_t value) { return reply->integer; } +int64_t RedisStoreHandler::getNumKeys() { + CHECK(false) << "getNumKeys not implemented for RedisStoreHandler"; + return 0; +} + +bool RedisStoreHandler::deleteKey(const std::string& /* unused */) { + CHECK(false) << "deleteKey not implemented for RedisStoreHandler"; + return false; +} + bool RedisStoreHandler::check(const std::vector& names) { std::vector args; args.push_back("EXISTS"); diff --git a/caffe2/distributed/redis_store_handler.h b/caffe2/distributed/redis_store_handler.h index 0caa888a6629..d5fa76741578 100644 --- a/caffe2/distributed/redis_store_handler.h +++ b/caffe2/distributed/redis_store_handler.h @@ -23,6 +23,10 @@ class CAFFE2_API RedisStoreHandler : public StoreHandler { virtual int64_t add(const std::string& name, int64_t value) override; + virtual int64_t getNumKeys() override; + + virtual bool deleteKey(const std::string& key) override; + virtual bool check(const std::vector& names) override; virtual void wait( diff --git a/caffe2/distributed/redis_store_handler_op_test.py b/caffe2/distributed/redis_store_handler_op_test.py index 3df69bf2701a..8f5d58e85185 100644 --- a/caffe2/distributed/redis_store_handler_op_test.py +++ b/caffe2/distributed/redis_store_handler_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import os import uuid diff --git a/caffe2/distributed/store_handler.h b/caffe2/distributed/store_handler.h index e11ea57aea3d..951fe26c6ec6 100644 --- a/caffe2/distributed/store_handler.h +++ b/caffe2/distributed/store_handler.h @@ -41,6 +41,16 @@ class CAFFE2_API StoreHandler { */ virtual int64_t add(const std::string& name, int64_t value) = 0; + /* + * Returns the number of keys in this store. + */ + virtual int64_t getNumKeys() = 0; + + /* + * Removes the specified key from the store. + */ + virtual bool deleteKey(const std::string& key) = 0; + /* * Check if a keys exist in the store. 
*/ diff --git a/caffe2/distributed/store_ops_test_util.py b/caffe2/distributed/store_ops_test_util.py index 2abe697cface..05245be9b210 100644 --- a/caffe2/distributed/store_ops_test_util.py +++ b/caffe2/distributed/store_ops_test_util.py @@ -1,9 +1,9 @@ ## @package store_ops_test_util # Module caffe2.distributed.store_ops_test_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from multiprocessing import Process, Queue diff --git a/caffe2/experiments/python/SparseTransformer.py b/caffe2/experiments/python/SparseTransformer.py index ff9ab7715c33..d97f076a7bb3 100644 --- a/caffe2/experiments/python/SparseTransformer.py +++ b/caffe2/experiments/python/SparseTransformer.py @@ -15,10 +15,10 @@ ## @package SparseTransformer # Module caffe2.experiments.python.SparseTransformer -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace import scipy.sparse diff --git a/caffe2/experiments/python/convnet_benchmarks.py b/caffe2/experiments/python/convnet_benchmarks.py index 386c9c4b7ebc..ff9b7a20bc73 100644 --- a/caffe2/experiments/python/convnet_benchmarks.py +++ b/caffe2/experiments/python/convnet_benchmarks.py @@ -15,10 +15,10 @@ ## @package convnet_benchmarks # Module caffe2.experiments.python.convnet_benchmarks -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + """ Benchmark for common convnets. diff --git a/caffe2/experiments/python/device_reduce_sum_bench.py b/caffe2/experiments/python/device_reduce_sum_bench.py index dbe0dae4f0c2..1a795e2fcf0e 100644 --- a/caffe2/experiments/python/device_reduce_sum_bench.py +++ b/caffe2/experiments/python/device_reduce_sum_bench.py @@ -15,10 +15,10 @@ ## @package device_reduce_sum_bench # Module caffe2.experiments.python.device_reduce_sum_bench -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import itertools diff --git a/caffe2/experiments/python/funhash_op_test.py b/caffe2/experiments/python/funhash_op_test.py index 6a4eb0e6b5b5..3fc4c8bf54fd 100644 --- a/caffe2/experiments/python/funhash_op_test.py +++ b/caffe2/experiments/python/funhash_op_test.py @@ -13,10 +13,10 @@ # limitations under the License. 
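The StoreHandler interface earlier in this section gains two pure virtuals, getNumKeys() and deleteKey(); the file- and Redis-backed handlers shown above satisfy them by failing loudly with CHECK(false). A minimal sketch of what a backend that does support them might look like (StoreHandlerLike and InMemoryStore are illustrative stand-ins under assumed names, not caffe2 classes):

#include <cassert>
#include <cstdint>
#include <string>
#include <unordered_map>

// Reduced interface carrying only the two new virtuals plus a setter.
class StoreHandlerLike {
 public:
  virtual ~StoreHandlerLike() = default;
  virtual void set(const std::string& key, const std::string& value) = 0;
  virtual int64_t getNumKeys() = 0;
  virtual bool deleteKey(const std::string& key) = 0;
};

// Hypothetical in-memory backend that can actually count and delete keys.
class InMemoryStore : public StoreHandlerLike {
 public:
  void set(const std::string& key, const std::string& value) override {
    data_[key] = value;
  }
  int64_t getNumKeys() override {
    return static_cast<int64_t>(data_.size());
  }
  bool deleteKey(const std::string& key) override {
    return data_.erase(key) > 0;  // true only if the key existed
  }

 private:
  std::unordered_map<std::string, std::string> data_;
};

int main() {
  InMemoryStore store;
  store.set("rank0", "addr");
  assert(store.getNumKeys() == 1);
  assert(store.deleteKey("rank0"));
  assert(store.getNumKeys() == 0);
  return 0;
}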
############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from scipy.sparse import coo_matrix diff --git a/caffe2/experiments/python/net_construct_bench.py b/caffe2/experiments/python/net_construct_bench.py index b7cf605c0c04..ec12517c03be 100644 --- a/caffe2/experiments/python/net_construct_bench.py +++ b/caffe2/experiments/python/net_construct_bench.py @@ -15,10 +15,10 @@ ## @package net_construct_bench # Module caffe2.experiments.python.net_construct_bench -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import logging diff --git a/caffe2/experiments/python/sparse_funhash_op_test.py b/caffe2/experiments/python/sparse_funhash_op_test.py index 2af006249c7d..cfc7a0bb6165 100644 --- a/caffe2/experiments/python/sparse_funhash_op_test.py +++ b/caffe2/experiments/python/sparse_funhash_op_test.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from scipy.sparse import coo_matrix diff --git a/caffe2/experiments/python/sparse_reshape_op_test.py b/caffe2/experiments/python/sparse_reshape_op_test.py index 5849580f09e1..a22bf561ce86 100644 --- a/caffe2/experiments/python/sparse_reshape_op_test.py +++ b/caffe2/experiments/python/sparse_reshape_op_test.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from scipy.sparse import coo_matrix diff --git a/caffe2/experiments/python/tt_contraction_op_test.py b/caffe2/experiments/python/tt_contraction_op_test.py index 4cd04a16ea23..1e41e9ed8ddd 100644 --- a/caffe2/experiments/python/tt_contraction_op_test.py +++ b/caffe2/experiments/python/tt_contraction_op_test.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/experiments/python/tt_pad_op_test.py b/caffe2/experiments/python/tt_pad_op_test.py index 10be7adcb453..27d13543348b 100644 --- a/caffe2/experiments/python/tt_pad_op_test.py +++ b/caffe2/experiments/python/tt_pad_op_test.py @@ -13,10 +13,10 @@ # limitations under the License. 
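The bucketize_op.cu hunk just below replaces the binary-search midpoint (high + low) / 2 with low + (high - low) / 2, which cannot overflow when low and high are both large. A small host-side sketch of the same search over sorted bucket boundaries (illustrative only, not the CUDA kernel):

#include <cassert>
#include <cstdint>
#include <vector>

// Returns the index of the first boundary >= x, i.e. the bucket id.
int32_t bucketize(const std::vector<float>& bounds, float x) {
  int32_t low = -1;
  int32_t high = static_cast<int32_t>(bounds.size());
  while (high - low > 1) {
    const int32_t median = low + (high - low) / 2;  // overflow-safe midpoint
    if (bounds[median] < x) {
      low = median;
    } else {
      high = median;
    }
  }
  return high;
}

int main() {
  const std::vector<float> bounds = {1.f, 3.f, 5.f};
  assert(bucketize(bounds, 0.5f) == 0);   // below the first boundary
  assert(bucketize(bounds, 3.f) == 1);    // lands exactly on bounds[1]
  assert(bucketize(bounds, 10.f) == 3);   // past the last boundary
  return 0;
}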
############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/operators/bucketize_op.cu b/caffe2/operators/bucketize_op.cu index 1d48013e771d..5d3049f239fb 100644 --- a/caffe2/operators/bucketize_op.cu +++ b/caffe2/operators/bucketize_op.cu @@ -15,7 +15,7 @@ __global__ void BucketizeOpKernel( CUDA_1D_KERNEL_LOOP(i, N) { int32_t low = -1, high = M; while (high - low > 1) { - int32_t median = (high + low) / 2; + const int32_t median = low + (high - low) / 2; if (bounds[median] < X[i]) { low = median; } else { diff --git a/caffe2/operators/gather_ranges_to_dense_op.cc b/caffe2/operators/gather_ranges_to_dense_op.cc index 10396aafc97e..aa31ef12b36a 100644 --- a/caffe2/operators/gather_ranges_to_dense_op.cc +++ b/caffe2/operators/gather_ranges_to_dense_op.cc @@ -104,3 +104,11 @@ NO_GRADIENT(GatherRangesToDense); } // namespace } // namespace caffe2 + +using GatherRangesToDenseCPUOp = + caffe2::GatherRangesToDenseOp; + +C10_EXPORT_CAFFE2_OP_TO_C10_CPU( + GatherRangesToDense, + "_caffe2::GatherRangesToDense(Tensor data, Tensor ranges, Tensor? key, int[] lengths, int min_observation, float max_mismatched_ratio, float max_empty_ratio) -> Tensor[] outputs", + GatherRangesToDenseCPUOp); diff --git a/caffe2/operators/gather_ranges_to_dense_op.h b/caffe2/operators/gather_ranges_to_dense_op.h index c1dd5a527005..217a61b25129 100644 --- a/caffe2/operators/gather_ranges_to_dense_op.h +++ b/caffe2/operators/gather_ranges_to_dense_op.h @@ -5,6 +5,7 @@ #include "caffe2/core/common_omp.h" #include "caffe2/core/context.h" +#include "caffe2/core/export_caffe2_op_to_c10.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/core/types.h" @@ -15,6 +16,8 @@ #include #include +C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(GatherRangesToDense); + namespace caffe2 { template class GatherRangesToDenseOp final : public Operator { diff --git a/caffe2/operators/layer_norm_op.h b/caffe2/operators/layer_norm_op.h index e1e8ec0693d6..543ad8dd0b34 100644 --- a/caffe2/operators/layer_norm_op.h +++ b/caffe2/operators/layer_norm_op.h @@ -52,6 +52,11 @@ class LayerNormOp final : public Operator { T* sigma_data = sigma->template mutable_data(); T* scale_data = scale_.template mutable_data(); T* bias_data = bias_.template mutable_data(); + + if (M == 0) { + return true; + } + const std::array X_dims = {M, N}; const std::array Y_dims = {M, 1}; math::Moments( @@ -174,6 +179,16 @@ class LayerNormGradientOp final : public Operator { g_scale_data = g_scale_.template mutable_data(); } + if (M == 0) { + if (N > 0 && dgamma_data != nullptr) { + math::Set(N, T(0), dgamma_data, &context_); + } + if (N > 0 && dbeta_data != nullptr) { + math::Set(N, T(0), dbeta_data, &context_); + } + return true; + } + ComputeInternalGradients( M, N, dY_data, X_data, gamma_data, dX_data, ds_data, db_data); ComputeFusedParams( diff --git a/caffe2/operators/mean_op.h b/caffe2/operators/mean_op.h index f16914f4a894..beb0b0440505 100644 --- a/caffe2/operators/mean_op.h +++ b/caffe2/operators/mean_op.h @@ -65,9 +65,11 @@ class MeanOp final : public Operator { bool RunOnDevice() override { if (Input(0).template IsType()) { return DoRunWithType(); + } else if (Input(0).template IsType()) { + return DoRunWithType(); } else { CAFFE_THROW( - "Mean operator only supports 32-bit float, but", + "Mean operator only supports 32-bit float 
or 64-bit double, but", " input was of type ", Input(0).dtype().name()); } @@ -111,9 +113,11 @@ class MeanGradientOp : public Operator { bool RunOnDevice() override { if (Input(0).template IsType()) { return DoRunWithType(); + } else if (Input(0).template IsType()) { + return DoRunWithType(); } else { CAFFE_THROW( - "Mean operator only supports 32-bit float, but", + "Mean operator only supports 32-bit float or 64-bit double, but", " input was of type ", Input(0).dtype().name()); } diff --git a/caffe2/operators/roi_align_gradient_op.cc b/caffe2/operators/roi_align_gradient_op.cc index 7f3b1155e1b3..6a9b2bab0ec3 100644 --- a/caffe2/operators/roi_align_gradient_op.cc +++ b/caffe2/operators/roi_align_gradient_op.cc @@ -191,7 +191,7 @@ void ROIAlignBackwardFeature( } // namespace template <> -bool RoIAlignGradientOp::RunOnDevice() { +C10_EXPORT bool RoIAlignGradientOp::RunOnDevice() { auto& X = Input(0); // Input data to pool auto& R = Input(1); // RoIs auto& dY = Input(2); // Gradient of net w.r.t. output of "forward" op diff --git a/caffe2/operators/roi_align_gradient_op.cu b/caffe2/operators/roi_align_gradient_op.cu index babf06d759eb..09f56e3269e7 100644 --- a/caffe2/operators/roi_align_gradient_op.cu +++ b/caffe2/operators/roi_align_gradient_op.cu @@ -190,7 +190,7 @@ __global__ void RoIAlignBackwardFeature( } // namespace template <> -bool RoIAlignGradientOp::RunOnDevice() { +C10_EXPORT bool RoIAlignGradientOp::RunOnDevice() { auto& X = Input(0); // Input data to pool auto& R = Input(1); // RoIs auto& dY = Input(2); // Gradient of net w.r.t. output of "forward" op diff --git a/caffe2/operators/roi_align_op.cc b/caffe2/operators/roi_align_op.cc index 997eb1404b2e..55cbb47be81c 100644 --- a/caffe2/operators/roi_align_op.cc +++ b/caffe2/operators/roi_align_op.cc @@ -84,7 +84,7 @@ std::vector> MakeBilinearInterpolationParams( } // namespace template <> -bool RoIAlignOp::RunOnDeviceWithOrderNCHW( +C10_EXPORT bool RoIAlignOp::RunOnDeviceWithOrderNCHW( int64_t N, int64_t C, int64_t H, @@ -170,7 +170,7 @@ bool RoIAlignOp::RunOnDeviceWithOrderNCHW( } template <> -bool RoIAlignOp::RunOnDeviceWithOrderNHWC( +C10_EXPORT bool RoIAlignOp::RunOnDeviceWithOrderNHWC( int64_t N, int64_t C, int64_t H, diff --git a/caffe2/operators/roi_align_op.cu b/caffe2/operators/roi_align_op.cu index 62d7842e2ae3..4d0edd3a408c 100644 --- a/caffe2/operators/roi_align_op.cu +++ b/caffe2/operators/roi_align_op.cu @@ -149,7 +149,7 @@ __global__ void RoIAlignForward( } // namespace template <> -bool RoIAlignOp::RunOnDevice() { +C10_EXPORT bool RoIAlignOp::RunOnDevice() { auto& X = Input(0); // Input data to pool auto& R = Input(1); // RoIs // RoI pooled data diff --git a/caffe2/operators/roi_align_rotated_gradient_op.cu b/caffe2/operators/roi_align_rotated_gradient_op.cu index 1ca0b73c72fa..cc16a828858f 100644 --- a/caffe2/operators/roi_align_rotated_gradient_op.cu +++ b/caffe2/operators/roi_align_rotated_gradient_op.cu @@ -198,7 +198,7 @@ __global__ void RoIAlignRotatedBackward( } // namespace template <> -bool RoIAlignRotatedGradientOp::RunOnDevice() { +C10_EXPORT bool RoIAlignRotatedGradientOp::RunOnDevice() { auto& X = Input(0); // Input data to pool auto& R = Input(1); // RoIs auto& dY = Input(2); // Gradient of net w.r.t. 
output of "forward" op diff --git a/caffe2/operators/roi_align_rotated_op.cc b/caffe2/operators/roi_align_rotated_op.cc index c94d0f11bd1f..73464f1fe6ee 100644 --- a/caffe2/operators/roi_align_rotated_op.cc +++ b/caffe2/operators/roi_align_rotated_op.cc @@ -291,7 +291,7 @@ void ROIAlignRotatedForward( } // namespace template <> -bool RoIAlignRotatedOp::RunOnDevice() { +C10_EXPORT bool RoIAlignRotatedOp::RunOnDevice() { auto& X = Input(0); // Input data to pool auto& R = Input(1); // RoIs diff --git a/caffe2/operators/roi_align_rotated_op.cu b/caffe2/operators/roi_align_rotated_op.cu index 96e4797c597c..67c1d38f51b4 100644 --- a/caffe2/operators/roi_align_rotated_op.cu +++ b/caffe2/operators/roi_align_rotated_op.cu @@ -158,7 +158,7 @@ __global__ void RoIAlignRotatedForward( } // namespace template <> -bool RoIAlignRotatedOp::RunOnDevice() { +C10_EXPORT bool RoIAlignRotatedOp::RunOnDevice() { auto& X = Input(0); // Input data to pool auto& R = Input(1); // RoIs diff --git a/caffe2/operators/roi_pool_op.cc b/caffe2/operators/roi_pool_op.cc index 95a6cbfa386c..d0018b03f4a6 100644 --- a/caffe2/operators/roi_pool_op.cc +++ b/caffe2/operators/roi_pool_op.cc @@ -8,7 +8,7 @@ using std::max; using std::min; template <> -bool RoIPoolOp::RunOnDevice() { +C10_EXPORT bool RoIPoolOp::RunOnDevice() { const auto& X = Input(0); // Input data to pool const auto& R = Input(1); // RoIs auto* Y = Output(0); // RoI pooled data diff --git a/caffe2/operators/roi_pool_op.cu b/caffe2/operators/roi_pool_op.cu index af479f8a5881..7c1ef1316623 100644 --- a/caffe2/operators/roi_pool_op.cu +++ b/caffe2/operators/roi_pool_op.cu @@ -167,7 +167,7 @@ bool RoIPoolOp::RunOnDevice() { } template <> -bool RoIPoolGradientOp::RunOnDevice() { +C10_EXPORT bool RoIPoolGradientOp::RunOnDevice() { auto& X = Input(0); // Input data to pool auto& R = Input(1); // RoIs auto& A = Input(2); // argmaxes diff --git a/caffe2/operators/sparse_to_dense_mask_op.cc b/caffe2/operators/sparse_to_dense_mask_op.cc index d968112c9ecc..b842d09e068d 100644 --- a/caffe2/operators/sparse_to_dense_mask_op.cc +++ b/caffe2/operators/sparse_to_dense_mask_op.cc @@ -45,21 +45,21 @@ Convert sparse representations to dense with given indices. Transforms a sparse representation of map represented as `indices` vector and `values` tensor into a compacted tensor where the first dimension -corresponds to each id provided in mask argument. Missing values are filled with -the value of `default_value`. After running this op: +corresponds to each id provided in the mask argument. Missing values are filled +with the value of `default_value`. After running this op: output[j, :] = values[i] // where mask[j] == indices[i] output[j, ...] = default_value // when mask[j] doesn't appear in indices -If `lengths` is provided and not empty, and extra "batch" dimension is prepended +If `lengths` is provided and not empty, an extra "batch" dimension is prepended to the output. -`values` and `default_value` can have additional matching dimensions, operation -is performed on the entire subtensor in thise case. +`values` and `default_value` can have additional matching dimensions +(the operation is performed on the entire subtensor in this case). 
-For example, if `lengths` is supplied and `values` is 1-D vector of floats and -`default_value` is a float scalar, the output is going to be a float matrix -of size `len(lengths) X len(mask)` +For example, if `lengths` is supplied and `values` is a 1-D vector of floats +and `default_value` is a float scalar, the output is going to be a float +matrix of size `len(lengths) X len(mask)`. )DOC") .Arg( "mask", @@ -67,6 +67,10 @@ of size `len(lengths) X len(mask)` .Arg( "return_presence_mask", "bool whether to return presence mask, false by default") + .Arg( + "max_skipped_indices", + "int argument representing the maximum number of invalid row ids that " + "can be skipped before returning an error. 50 by default") .Input(0, "indices", "1-D int32/int64 tensor of concatenated ids of data") .Input(1, "values", "Data tensor, first dimension has to match `indices`") .Input( @@ -117,3 +121,18 @@ class GetSparseToDenseMaskGradient : public GradientMakerBase { REGISTER_GRADIENT(SparseToDenseMask, GetSparseToDenseMaskGradient); } // namespace } // namespace caffe2 + +// clang-format off +C10_EXPORT_CAFFE2_OP_TO_C10_CPU( + SparseToDenseMask, + "_caffe2::SparseToDenseMask(" + "Tensor indices, " + "Tensor values, " + "Tensor default_value, " + "Tensor? lengths, " + "int[] mask, " + "bool? return_presence_mask = False, " + "int? max_skipped_indices = 50" + ") -> (Tensor output, Tensor presence_mask)", + caffe2::SparseToDenseMaskOp); +// clang-format on diff --git a/caffe2/operators/sparse_to_dense_mask_op.h b/caffe2/operators/sparse_to_dense_mask_op.h index 8ed589c6d734..26213c0cff33 100644 --- a/caffe2/operators/sparse_to_dense_mask_op.h +++ b/caffe2/operators/sparse_to_dense_mask_op.h @@ -5,10 +5,13 @@ #include #include #include "caffe2/core/context.h" +#include "caffe2/core/export_caffe2_op_to_c10.h" #include "caffe2/core/operator.h" #include "caffe2/core/tensor.h" #include "caffe2/utils/math.h" +C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(SparseToDenseMask); + namespace caffe2 { template diff --git a/caffe2/opt/bound_shape_inferencer.cc b/caffe2/opt/bound_shape_inferencer.cc index d37717d5b957..d8fe956a0ddd 100644 --- a/caffe2/opt/bound_shape_inferencer.cc +++ b/caffe2/opt/bound_shape_inferencer.cc @@ -857,7 +857,8 @@ void BoundShapeInferencer::InferCommonOp(const OperatorDef& op) { try { const static std::unordered_set types_with_independent_output_shape = {"Int8GenQuantParams", - "Int8QuantSchemeBlobFill"}; + "Int8QuantSchemeBlobFill", + "ComputeEqualizationScale"}; std::vector input_shapes; for (const auto& input : op.input()) { const auto it = shape_info_.find(input); @@ -883,6 +884,7 @@ void BoundShapeInferencer::InferCommonOp(const OperatorDef& op) { bool is_quantized = !(op.type().compare(0, 4, "Int8")) && (op.type() != "Int8Dequantize") && (op.type() != "Int8QuantSchemeBlobFill") && + (op.type() != "ComputeEqualizationScale") && (op.type() != "Int8GenQuantParams"); float scale = 1; int offset = 0; diff --git a/caffe2/opt/onnxifi_op.h b/caffe2/opt/onnxifi_op.h index 6a211a604d52..f19403a14e58 100644 --- a/caffe2/opt/onnxifi_op.h +++ b/caffe2/opt/onnxifi_op.h @@ -263,10 +263,13 @@ class OnnxifiOp final : public Operator { defered_blob_reader = ws->GetBlob("__DEFERRED_BLOB_READER__"); } onnxGraph graph{nullptr}; + + static const uint64_t auxPropertiesListAOT[] = { + ONNXIFI_OPTIMIZATION_AOT, ONNXIFI_GRAPH_PROPERTY_NONE}; CAFFE_ENFORCE_EQ( lib_->onnxInitGraph( backend, - nullptr, + use_glow_aot_ ? 
auxPropertiesListAOT : nullptr, onnx_model_str.size(), (const void*)(onnx_model_str.c_str()), weight_descs.size(), diff --git a/caffe2/opt/shape_info.cc b/caffe2/opt/shape_info.cc index 0ff55693395f..dfcdeb0356bd 100644 --- a/caffe2/opt/shape_info.cc +++ b/caffe2/opt/shape_info.cc @@ -5,6 +5,63 @@ namespace caffe2 { +namespace { +bool isNumber(const std::string& s) { + bool empty = true; + for (const char c : s) { + if (std::isalpha(c)) { + return false; + } + if (!std::isspace(c)) { + empty = false; + } + } + return !empty; +} + +std::string toLower(const std::string& s) { + std::string t; + t.resize(s.size()); + for (size_t i = 0; i < t.size(); i++) { + t[i] = std::tolower(s[i]); + } + return t; +} + +TensorProto_DataType toTensorProtoDataType(const std::string& in) { + std::string s = toLower(in); + if (s == "uint8") { + return TensorProto_DataType_UINT8; + } else if (s == "int8") { + return TensorProto_DataType_INT8; + } else if (s == "uint16") { + return TensorProto_DataType_UINT16; + } else if (s == "int16") { + return TensorProto_DataType_INT16; + } else if (s == "int32") { + return TensorProto_DataType_INT32; + } else if (s == "int64") { + return TensorProto_DataType_INT64; + } else if (s == "float16" || s == "half") { + return TensorProto_DataType_FLOAT16; + } else if (s == "float") { + return TensorProto_DataType_FLOAT; + } else if (s == "double") { + return TensorProto_DataType_DOUBLE; + } else if (s == "byte") { + return TensorProto_DataType_BYTE; + } else if (s == "string") { + return TensorProto_DataType_STRING; + } else if (s == "bool") { + return TensorProto_DataType_BOOL; + } else if (s == "hash") { + return TensorProto_DataType_ZERO_COLLISION_HASH; + } + // return default data type, float + return TensorProto_DataType_FLOAT; +} +} // namespace + ShapeInfo getShapeInfoFromBlob(const Blob* blob) { ShapeInfo shape_info; shape_info.shape = GetTensorShapeOfBlob(blob); @@ -138,14 +195,24 @@ void parseShapeInfoMapFromString( const auto& name = kv[0]; TensorShape shape; - if (name.find("int8") != std::string::npos) { - shape.set_data_type(TensorProto_DataType_UINT8); + size_t size = kv.size(); + CAFFE_ENFORCE_GT(size, 1); + if (!isNumber(kv[size - 1])) { + // last value is the type + shape.set_data_type(toTensorProtoDataType(kv[size - 1])); + size--; } else { - shape.set_data_type(TensorProto_DataType_FLOAT); + if (name.find("int8") != std::string::npos) { + // Kept for backwards compatibility. + // Set type explicitly to overwrite it. 
+ shape.set_data_type(TensorProto_DataType_UINT8); + } else { + shape.set_data_type(TensorProto_DataType_FLOAT); + } } bool valid = true; - for (int i = 1; i < kv.size(); i++) { + for (int i = 1; i < size; i++) { auto dim = kv[i]; try { shape.add_dims(std::stoi(dim)); diff --git a/caffe2/perfkernels/fused_nbit_rowwise_conversion.cc b/caffe2/perfkernels/fused_nbit_rowwise_conversion.cc index 528bbee3c2ca..35b9605021e6 100644 --- a/caffe2/perfkernels/fused_nbit_rowwise_conversion.cc +++ b/caffe2/perfkernels/fused_nbit_rowwise_conversion.cc @@ -6,6 +6,10 @@ #include "common.h" +#ifdef USE_FBGEMM +#include "fbgemm/QuantUtils.h" +#endif + namespace caffe2 { void FloatToFused8BitRowwiseQuantized__base( @@ -58,46 +62,32 @@ void Fused8BitRowwiseQuantizedToFloat__base( } } -decltype(FloatToFused8BitRowwiseQuantized__base) - FloatToFused8BitRowwiseQuantized__avx2_fma; void FloatToFused8BitRowwiseQuantized( const float* input, int input_rows, int input_columns, std::uint8_t* output) { - AVX2_FMA_DO( - FloatToFused8BitRowwiseQuantized, - input, - input_rows, - input_columns, - output); - BASE_DO( - FloatToFused8BitRowwiseQuantized, - input, - input_rows, - input_columns, - output); +#ifdef USE_FBGEMM + fbgemm::FloatToFused8BitRowwiseQuantizedSBFloat( + input, input_rows, input_columns, output); +#else + FloatToFused8BitRowwiseQuantized__base( + input, input_rows, input_columns, output); +#endif } -decltype(Fused8BitRowwiseQuantizedToFloat__base) - Fused8BitRowwiseQuantizedToFloat__avx2_fma; void Fused8BitRowwiseQuantizedToFloat( const std::uint8_t* input, int input_rows, int input_columns, float* output) { - AVX2_FMA_DO( - Fused8BitRowwiseQuantizedToFloat, - input, - input_rows, - input_columns, - output); - BASE_DO( - Fused8BitRowwiseQuantizedToFloat, - input, - input_rows, - input_columns, - output); +#ifdef USE_FBGEMM + fbgemm::Fused8BitRowwiseQuantizedSBFloatToFloat( + input, input_rows, input_columns, output); +#else + Fused8BitRowwiseQuantizedToFloat__base( + input, input_rows, input_columns, output); +#endif } void FloatToFusedNBitRowwiseQuantizedSBHalf__base( @@ -184,52 +174,34 @@ void FusedNBitRowwiseQuantizedSBHalfToFloat__base( } } -decltype(FloatToFusedNBitRowwiseQuantizedSBHalf__base) - FloatToFusedNBitRowwiseQuantizedSBHalf__avx2_fma; void FloatToFusedNBitRowwiseQuantizedSBHalf( int bit_rate, const float* input, int input_rows, int input_columns, std::uint8_t* output) { - AVX2_FMA_DO( - FloatToFusedNBitRowwiseQuantizedSBHalf, - bit_rate, - input, - input_rows, - input_columns, - output); - BASE_DO( - FloatToFusedNBitRowwiseQuantizedSBHalf, - bit_rate, - input, - input_rows, - input_columns, - output); +#ifdef USE_FBGEMM + fbgemm::FloatToFusedNBitRowwiseQuantizedSBHalf( + bit_rate, input, input_rows, input_columns, output); +#else + FloatToFusedNBitRowwiseQuantizedSBHalf__base( + bit_rate, input, input_rows, input_columns, output); +#endif } -decltype(FusedNBitRowwiseQuantizedSBHalfToFloat__base) - FusedNBitRowwiseQuantizedSBHalfToFloat__avx2_fma; void FusedNBitRowwiseQuantizedSBHalfToFloat( int bit_rate, const std::uint8_t* input, int input_rows, int input_columns, float* output) { - AVX2_FMA_DO( - FusedNBitRowwiseQuantizedSBHalfToFloat, - bit_rate, - input, - input_rows, - input_columns, - output); - BASE_DO( - FusedNBitRowwiseQuantizedSBHalfToFloat, - bit_rate, - input, - input_rows, - input_columns, - output); +#ifdef USE_FBGEMM + fbgemm::FusedNBitRowwiseQuantizedSBHalfToFloat( + bit_rate, input, input_rows, input_columns, output); +#else + 
FusedNBitRowwiseQuantizedSBHalfToFloat__base( + bit_rate, input, input_rows, input_columns, output); +#endif } } // namespace caffe2 diff --git a/caffe2/perfkernels/fused_nbit_rowwise_conversion_avx2.cc b/caffe2/perfkernels/fused_nbit_rowwise_conversion_avx2.cc deleted file mode 100644 index e7053b5136c0..000000000000 --- a/caffe2/perfkernels/fused_nbit_rowwise_conversion_avx2.cc +++ /dev/null @@ -1,534 +0,0 @@ -#include "./fused_nbit_rowwise_conversion.h" - -#include -#include -#include // for FLT_MAX -#include - -#include "./cvtsh_ss_bugfix.h" - -namespace caffe2 { - -constexpr int VLEN = 8; - -void FloatToFused8BitRowwiseQuantized__avx2_fma( - const float* input, - int input_rows, - int input_columns, - std::uint8_t* output) { - constexpr float kEpsilon = 1e-8f; - - __m256i permute_mask1_v = - _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); - __m256i shuffle_mask_v = _mm256_set_epi8( - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0c, - 0x08, - 0x04, - 0x00, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0c, - 0x08, - 0x04, - 0x00); - __m256i permute_mask2_v = - _mm256_set_epi32(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00); - - int output_columns = input_columns + 2 * sizeof(float); - for (std::size_t row = 0; row < input_rows; ++row) { - const float* input_row = input + row * input_columns; - std::uint8_t* output_row = output + row * output_columns; - float* output_row_scale_bias = - reinterpret_cast(output_row + input_columns); - - float minimum_element = FLT_MAX; - float maximum_element = -FLT_MAX; - __m256 min_v = _mm256_set1_ps(minimum_element); - __m256 max_v = _mm256_set1_ps(maximum_element); - std::size_t col; - for (col = 0; col < input_columns / VLEN * VLEN; col += VLEN) { - __m256 in_v = _mm256_loadu_ps(input_row + col); - min_v = _mm256_min_ps(min_v, in_v); - max_v = _mm256_max_ps(max_v, in_v); - } - alignas(64) float min_buf[VLEN], max_buf[VLEN]; - _mm256_store_ps(min_buf, min_v); - _mm256_store_ps(max_buf, max_v); - for (int i = 0; i < VLEN; ++i) { - minimum_element = std::min(minimum_element, min_buf[i]); - maximum_element = std::max(maximum_element, max_buf[i]); - } - for (; col < input_columns; ++col) { - minimum_element = std::min(minimum_element, input_row[col]); - maximum_element = std::max(maximum_element, input_row[col]); - } - - float range = maximum_element - minimum_element; - - output_row_scale_bias[0] = range / 255.0f; - output_row_scale_bias[1] = minimum_element; - const auto inverse_scale = 255.0f / (range + kEpsilon); - min_v = _mm256_set1_ps(minimum_element); - __m256 inverse_scale_v = _mm256_set1_ps(inverse_scale); - - for (col = 0; col < input_columns / (4 * VLEN) * (4 * VLEN); - col += 4 * VLEN) { - __m256i x_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col), min_v), - inverse_scale_v)); - __m256i y_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col + VLEN), min_v), - inverse_scale_v)); - __m256i z_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col + 2 * VLEN), min_v), - inverse_scale_v)); - __m256i w_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col + 3 * VLEN), min_v), - inverse_scale_v)); - - // An instruction sequence to save 32 32-bit integers as 8-bit integers - __m256i xy_packed_v = _mm256_packs_epi32(x_rounded_v, y_rounded_v); - __m256i zw_packed_v = 
_mm256_packs_epi32(z_rounded_v, w_rounded_v); - __m256i xyzw_packed_v = _mm256_packus_epi16(xy_packed_v, zw_packed_v); - xyzw_packed_v = - _mm256_permutevar8x32_epi32(xyzw_packed_v, permute_mask1_v); - _mm256_storeu_si256( - reinterpret_cast<__m256i*>(output_row + col), xyzw_packed_v); - } - for (; col < input_columns / VLEN * VLEN; col += VLEN) { - __m256i rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col), min_v), - inverse_scale_v)); - - // An instruction sequence to save 8 32-bit integers as 8-bit integers - rounded_v = _mm256_shuffle_epi8(rounded_v, shuffle_mask_v); - rounded_v = _mm256_permutevar8x32_epi32(rounded_v, permute_mask2_v); - _mm_storel_epi64( - reinterpret_cast<__m128i*>(output_row + col), - _mm256_castsi256_si128(rounded_v)); - } - for (; col < input_columns; ++col) { - output_row[col] = - std::lrintf((input_row[col] - minimum_element) * inverse_scale); - } - } -} - -void Fused8BitRowwiseQuantizedToFloat__avx2_fma( - const std::uint8_t* input, - int input_rows, - int input_columns, - float* output) { - int output_columns = input_columns - 2 * sizeof(float); - - for (std::size_t row = 0; row < input_rows; ++row) { - const std::uint8_t* input_row = input + row * input_columns; - const float* input_row_scale_bias = - reinterpret_cast(input_row + output_columns); - float* output_row = output + row * output_columns; - - __m256 scale_v = _mm256_set1_ps(input_row_scale_bias[0]); - __m256 bias_v = _mm256_set1_ps(input_row_scale_bias[1]); - - std::size_t col; - for (col = 0; col < output_columns / VLEN * VLEN; col += VLEN) { - __m256 in_v = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32( - _mm_loadl_epi64(reinterpret_cast(input_row + col)))); - _mm256_storeu_ps( - output_row + col, - _mm256_add_ps(_mm256_mul_ps(in_v, scale_v), bias_v)); - } - - for (; col < output_columns; ++col) { - output_row[col] = - input_row[col] * input_row_scale_bias[0] + input_row_scale_bias[1]; - } - } -} - -namespace { - -template -void FloatToFusedNBitRowwiseQuantizedSBHalf_( - const float* input, - int input_rows, - int input_columns, - std::uint8_t* output) { - __m256i permute_mask1_v = - _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); - - int NUM_ELEM_PER_BYTE = 8 / BIT_RATE; - int output_columns = - (input_columns + NUM_ELEM_PER_BYTE - 1) / NUM_ELEM_PER_BYTE + - 2 * sizeof(std::uint16_t); - for (std::size_t row = 0; row < input_rows; ++row) { - const float* input_row = input + row * input_columns; - std::uint8_t* output_row = output + row * output_columns; - std::uint16_t* output_row_scale_bias = reinterpret_cast( - output_row + - (input_columns + NUM_ELEM_PER_BYTE - 1) / NUM_ELEM_PER_BYTE); - - float minimum_element = FLT_MAX; - float maximum_element = -FLT_MAX; - __m256 min_v = _mm256_set1_ps(minimum_element); - __m256 max_v = _mm256_set1_ps(maximum_element); - std::size_t col; - for (col = 0; col < input_columns / VLEN * VLEN; col += VLEN) { - __m256 in_v = _mm256_loadu_ps(input_row + col); - min_v = _mm256_min_ps(min_v, in_v); - max_v = _mm256_max_ps(max_v, in_v); - } - alignas(64) float min_buf[VLEN], max_buf[VLEN]; - _mm256_store_ps(min_buf, min_v); - _mm256_store_ps(max_buf, max_v); - for (int i = 0; i < VLEN; ++i) { - minimum_element = std::min(minimum_element, min_buf[i]); - maximum_element = std::max(maximum_element, max_buf[i]); - } - for (; col < input_columns; ++col) { - minimum_element = std::min(minimum_element, input_row[col]); - maximum_element = std::max(maximum_element, input_row[col]); - } - - output_row_scale_bias[1] = 
_cvtss_sh( - minimum_element, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - minimum_element = _cvtsh_ss(output_row_scale_bias[1]); - const float range = maximum_element - minimum_element; - - float scale = range == 0 ? 1.0f : range / ((1 << BIT_RATE) - 1); - std::uint16_t scale_fp16 = - _cvtss_sh(scale, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - scale = _cvtsh_ss(scale_fp16); - if (scale == 0) { - // Corner case handling when maximum_element == minimum_element - // Any scale would work because maximum_element - minimum_element will be - // 0 for all X - scale = 1.0f; - } - float inverse_scale = 1.0f / scale; - if (std::isinf(inverse_scale)) { - scale = 1.0f; - inverse_scale = 1.0f; - } - - output_row_scale_bias[0] = - _cvtss_sh(scale, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - - __m256 inverse_scale_v = _mm256_set1_ps(inverse_scale); - min_v = _mm256_set1_ps(minimum_element); - - col = 0; - - if (BIT_RATE == 2 || BIT_RATE == 4) { - for (; col + 4 * VLEN <= input_columns; col += 4 * VLEN) { - __m256i x_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col), min_v), - inverse_scale_v)); - __m256i y_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col + VLEN), min_v), - inverse_scale_v)); - __m256i z_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col + 2 * VLEN), min_v), - inverse_scale_v)); - __m256i w_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col + 3 * VLEN), min_v), - inverse_scale_v)); - - // An instruction sequence to save 32 32-bit integers as 8-bit integers - __m256i xy_packed_v = _mm256_packs_epi32(x_rounded_v, y_rounded_v); - __m256i zw_packed_v = _mm256_packs_epi32(z_rounded_v, w_rounded_v); - __m256i xyzw_packed_v = _mm256_packus_epi16(xy_packed_v, zw_packed_v); - xyzw_packed_v = - _mm256_permutevar8x32_epi32(xyzw_packed_v, permute_mask1_v); - - // saturate to BIT_RATE - xyzw_packed_v = _mm256_min_epu8( - xyzw_packed_v, - _mm256_set1_epi8(static_cast((1 << BIT_RATE) - 1))); - - if (BIT_RATE == 4) { - // pack into lower 8-bit of each 16-bit - xyzw_packed_v = _mm256_and_si256( - _mm256_or_si256( - xyzw_packed_v, _mm256_srli_epi16(xyzw_packed_v, 4)), - _mm256_set1_epi16(0x00ff)); - } else { - // pack into lower 8-bit of each 32-bit - xyzw_packed_v = _mm256_and_si256( - _mm256_or_si256( - _mm256_or_si256( - xyzw_packed_v, _mm256_srli_epi32(xyzw_packed_v, 6)), - _mm256_or_si256( - _mm256_srli_epi32(xyzw_packed_v, 8 + 4), - _mm256_srli_epi32(xyzw_packed_v, 2 * 8 + 2))), - _mm256_set1_epi32(0x00ff)); - } - - __m128i out_v; - if (BIT_RATE == 4) { - // avx2 doesn't have _mm256_cvtepi16_epi8 - out_v = _mm_packus_epi16( - _mm256_castsi256_si128(xyzw_packed_v), - _mm256_extractf128_si256(xyzw_packed_v, 1)); - _mm_storeu_si128( - reinterpret_cast<__m128i*>(output_row + col / NUM_ELEM_PER_BYTE), - out_v); - } else { - // avx2 doesn't have _mm256_cvtepi32_epi8 - out_v = _mm_packus_epi32( - _mm256_castsi256_si128(xyzw_packed_v), - _mm256_extractf128_si256(xyzw_packed_v, 1)); - out_v = _mm_packus_epi16(out_v, out_v); - _mm_storel_epi64( - reinterpret_cast<__m128i*>(output_row + col / NUM_ELEM_PER_BYTE), - out_v); - } - } - } - - for (; col < input_columns; ++col) { - float X = input_row[col]; - std::uint8_t quantized = std::max( - 0, - std::min( - std::lrintf((X - minimum_element) * inverse_scale), - (1 << BIT_RATE) - 1)); - if (col % NUM_ELEM_PER_BYTE == 0) { - output_row[col / NUM_ELEM_PER_BYTE] = quantized; - 
} else { - output_row[col / NUM_ELEM_PER_BYTE] |= - (quantized << ((col % NUM_ELEM_PER_BYTE) * BIT_RATE)); - } - } - } -} - -template -void FusedNBitRowwiseQuantizedSBHalfToFloat_( - const std::uint8_t* input, - int input_rows, - int input_columns, - float* output) { - constexpr int NUM_ELEM_PER_BYTE = 8 / BIT_RATE; - int output_columns = - (input_columns - 2 * sizeof(std::uint16_t)) * NUM_ELEM_PER_BYTE; - - // mask can be accessed by avx2_ps_or_epi32_combined_mask[(8 - remainder) % 8] - static const int avx2_ps_or_epi32_combined_mask[16] = { - -1, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - }; - - // Compute a remainder for vector load - // Since every row is followed by 2 fp16 (scale and bias), luckily - // we don't need mask at bit-rate granularity but just at 32-bit - // granularity. - constexpr int NUM_ELEM_PER_32BIT = 32 / BIT_RATE; - // multiply by 4 because we're handling 4 vlen per iteration - constexpr int NUM_OF_32BIT_PER_VLOAD = VLEN * 4 / NUM_ELEM_PER_32BIT; - int remainder_32bit_granularity = (output_columns + NUM_ELEM_PER_32BIT - 1) / - NUM_ELEM_PER_32BIT % NUM_OF_32BIT_PER_VLOAD; - __m128i vmask_load = _mm_lddqu_si128(reinterpret_cast( - avx2_ps_or_epi32_combined_mask + NUM_OF_32BIT_PER_VLOAD + - (NUM_OF_32BIT_PER_VLOAD - remainder_32bit_granularity) % - NUM_OF_32BIT_PER_VLOAD)); - int remainder = output_columns % (4 * VLEN); - __m256i vmask_store0 = _mm256_loadu_si256(reinterpret_cast( - avx2_ps_or_epi32_combined_mask + - (VLEN - std::min(output_columns % (4 * VLEN), VLEN) % (VLEN + 1)))); - __m256i vmask_store1 = _mm256_loadu_si256(reinterpret_cast( - avx2_ps_or_epi32_combined_mask + - (VLEN - - std::max(0, std::min(output_columns % (4 * VLEN) - VLEN, VLEN)) % - (VLEN + 1)))); - __m256i vmask_store2 = _mm256_loadu_si256(reinterpret_cast( - avx2_ps_or_epi32_combined_mask + - (VLEN - - std::max(0, std::min(output_columns % (4 * VLEN) - 2 * VLEN, VLEN)) % - (VLEN + 1)))); - __m256i vmask_store3 = _mm256_loadu_si256(reinterpret_cast( - avx2_ps_or_epi32_combined_mask + - (VLEN - - std::max(0, std::min(output_columns % (4 * VLEN) - 3 * VLEN, VLEN)) % - (VLEN + 1)))); - - for (std::size_t row = 0; row < input_rows; ++row) { - const std::uint8_t* input_row = input + row * input_columns; - const std::uint16_t* input_row_scale_bias = - reinterpret_cast( - input_row + - (output_columns + NUM_ELEM_PER_BYTE - 1) / NUM_ELEM_PER_BYTE); - float scale = _cvtsh_ss(input_row_scale_bias[0]); - float bias = _cvtsh_ss(input_row_scale_bias[1]); - float* output_row = output + row * output_columns; - - std::size_t col = 0; - if (BIT_RATE == 4 || BIT_RATE == 2) { - __m256 vscale = _mm256_set1_ps(scale); - __m256 vbias = _mm256_set1_ps(bias); - for (; col + 4 * VLEN <= output_columns; col += 4 * VLEN) { - __m256i vinq; - // unpack to 8-bit integers - if (BIT_RATE == 4) { - vinq = _mm256_cvtepu8_epi16( - _mm_loadu_si128(reinterpret_cast( - input_row + col / NUM_ELEM_PER_BYTE))); - vinq = _mm256_and_si256( - _mm256_or_si256(vinq, _mm256_slli_epi32(vinq, 4)), - _mm256_set1_epi16(0x0f0f)); - } else { - vinq = _mm256_cvtepu8_epi32( - _mm_loadl_epi64(reinterpret_cast( - input_row + col / NUM_ELEM_PER_BYTE))); - vinq = _mm256_and_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_slli_epi32(vinq, 2 * 8 + 2), - _mm256_slli_epi32(vinq, 8 + 4)), - _mm256_or_si256(_mm256_slli_epi32(vinq, 6), vinq)), - _mm256_set1_epi32(0x03030303)); - } - __m256 vinq0 = _mm256_cvtepi32_ps( - _mm256_cvtepi8_epi32(_mm256_castsi256_si128(vinq))); - __m256 vinq1 = 
_mm256_cvtepi32_ps(_mm256_cvtepi8_epi32( - _mm_set1_epi64x(_mm256_extract_epi64(vinq, 1)))); - __m256 vinq2 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32( - _mm_set1_epi64x(_mm256_extract_epi64(vinq, 2)))); - __m256 vinq3 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32( - _mm_set1_epi64x(_mm256_extract_epi64(vinq, 3)))); - vinq0 = _mm256_fmadd_ps(vscale, vinq0, vbias); - vinq1 = _mm256_fmadd_ps(vscale, vinq1, vbias); - vinq2 = _mm256_fmadd_ps(vscale, vinq2, vbias); - vinq3 = _mm256_fmadd_ps(vscale, vinq3, vbias); - _mm256_storeu_ps(output_row + col, vinq0); - _mm256_storeu_ps(output_row + col + VLEN, vinq1); - _mm256_storeu_ps(output_row + col + 2 * VLEN, vinq2); - _mm256_storeu_ps(output_row + col + 3 * VLEN, vinq3); - } - - if (remainder) { - __m256i vinq; - if (BIT_RATE == 4) { - vinq = _mm256_cvtepu8_epi16(_mm_maskload_epi32( - reinterpret_cast(input_row + col / NUM_ELEM_PER_BYTE), - vmask_load)); - vinq = _mm256_and_si256( - _mm256_or_si256(vinq, _mm256_slli_epi32(vinq, 4)), - _mm256_set1_epi16(0x0f0f)); - } else { - vinq = _mm256_cvtepu8_epi32(_mm_maskload_epi32( - reinterpret_cast(input_row + col / NUM_ELEM_PER_BYTE), - vmask_load)); - vinq = _mm256_and_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_slli_epi32(vinq, 2 * 8 + 2), - _mm256_slli_epi32(vinq, 8 + 4)), - _mm256_or_si256(_mm256_slli_epi32(vinq, 6), vinq)), - _mm256_set1_epi32(0x03030303)); - } - - __m256 vinq0 = _mm256_cvtepi32_ps( - _mm256_cvtepi8_epi32(_mm256_castsi256_si128(vinq))); - __m256 vinq1 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32( - _mm_set1_epi64x(_mm256_extract_epi64(vinq, 1)))); - __m256 vinq2 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32( - _mm_set1_epi64x(_mm256_extract_epi64(vinq, 2)))); - __m256 vinq3 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32( - _mm_set1_epi64x(_mm256_extract_epi64(vinq, 3)))); - - vinq0 = _mm256_fmadd_ps(vscale, vinq0, vbias); - vinq1 = _mm256_fmadd_ps(vscale, vinq1, vbias); - vinq2 = _mm256_fmadd_ps(vscale, vinq2, vbias); - vinq3 = _mm256_fmadd_ps(vscale, vinq3, vbias); - - _mm256_maskstore_ps(output_row + col, vmask_store0, vinq0); - _mm256_maskstore_ps(output_row + col + VLEN, vmask_store1, vinq1); - _mm256_maskstore_ps(output_row + col + 2 * VLEN, vmask_store2, vinq2); - _mm256_maskstore_ps(output_row + col + 3 * VLEN, vmask_store3, vinq3); - } - } else { - for (; col < output_columns; ++col) { - std::uint8_t quantized = input_row[col / NUM_ELEM_PER_BYTE]; - quantized >>= (col % NUM_ELEM_PER_BYTE) * BIT_RATE; - quantized &= (1 << BIT_RATE) - 1; - output_row[col] = scale * quantized + bias; - } - } - } -} -} // namespace - -void FloatToFusedNBitRowwiseQuantizedSBHalf__avx2_fma( - int bit_rate, - const float* input, - int input_rows, - int input_columns, - std::uint8_t* output) { - if (bit_rate == 2) { - FloatToFusedNBitRowwiseQuantizedSBHalf_<2>( - input, input_rows, input_columns, output); - } else if (bit_rate == 4) { - FloatToFusedNBitRowwiseQuantizedSBHalf_<4>( - input, input_rows, input_columns, output); - } else if (bit_rate == 8) { - FloatToFusedNBitRowwiseQuantizedSBHalf_<8>( - input, input_rows, input_columns, output); - } -} - -void FusedNBitRowwiseQuantizedSBHalfToFloat__avx2_fma( - int bit_rate, - const std::uint8_t* input, - int input_rows, - int input_columns, - float* output) { - if (bit_rate == 2) { - FusedNBitRowwiseQuantizedSBHalfToFloat_<2>( - input, input_rows, input_columns, output); - } else if (bit_rate == 4) { - FusedNBitRowwiseQuantizedSBHalfToFloat_<4>( - input, input_rows, input_columns, output); - } else { - FusedNBitRowwiseQuantizedSBHalfToFloat_<8>( - input, 
input_rows, input_columns, output); - } -} - -} // namespace caffe2 diff --git a/caffe2/perfkernels/hp_emblookup_codegen.py b/caffe2/perfkernels/hp_emblookup_codegen.py index f79b7c8e7d9c..75b0c8b583be 100644 --- a/caffe2/perfkernels/hp_emblookup_codegen.py +++ b/caffe2/perfkernels/hp_emblookup_codegen.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import argparse import sys diff --git a/caffe2/proto/caffe2_pb2.pyi b/caffe2/proto/caffe2_pb2.pyi new file mode 100644 index 000000000000..060f60fc6c88 --- /dev/null +++ b/caffe2/proto/caffe2_pb2.pyi @@ -0,0 +1,18 @@ + +# Defined in caffe2/proto/caffe2_pb2.h +class DeviceType: + ... + +CPU: DeviceType = ... +CUDA: DeviceType = ... +OPENGL: DeviceType = ... +OPENCL: DeviceType = ... +MKLDNN: DeviceType = ... +IDEEP: DeviceType = ... +HIP: DeviceType = ... + +class NetDef: + ... + +class OperatorDef: + ... \ No newline at end of file diff --git a/caffe2/python/__init__.py b/caffe2/python/__init__.py index 09b5652e61f2..8582eff9ce19 100644 --- a/caffe2/python/__init__.py +++ b/caffe2/python/__init__.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + from caffe2.proto import caffe2_pb2 import os import sys diff --git a/caffe2/python/allcompare_test.py b/caffe2/python/allcompare_test.py index 663cc9e02864..22038715f289 100644 --- a/caffe2/python/allcompare_test.py +++ b/caffe2/python/allcompare_test.py @@ -1,9 +1,9 @@ #!/usr/bin/env python -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given, settings import hypothesis.strategies as st diff --git a/caffe2/python/attention.py b/caffe2/python/attention.py index 73be94feaf2b..59f4a5adb6a5 100644 --- a/caffe2/python/attention.py +++ b/caffe2/python/attention.py @@ -1,9 +1,9 @@ ## @package attention # Module caffe2.python.attention -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import brew diff --git a/caffe2/python/benchmark_generator.py b/caffe2/python/benchmark_generator.py index 8393ca7875aa..84d0d46490b0 100644 --- a/caffe2/python/benchmark_generator.py +++ b/caffe2/python/benchmark_generator.py @@ -1,9 +1,9 @@ #!/usr/bin/env python -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import string import argparse diff --git a/caffe2/python/benchmarks/fused_rowwise_nbit_conversion_bench.py b/caffe2/python/benchmarks/fused_rowwise_nbit_conversion_bench.py index 9b9a196e9770..ce96dbc1dd63 100644 --- a/caffe2/python/benchmarks/fused_rowwise_nbit_conversion_bench.py +++ b/caffe2/python/benchmarks/fused_rowwise_nbit_conversion_bench.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import argparse diff --git a/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py b/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py index bdba35545255..1b683be0d51e 100644 --- a/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py +++ b/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import argparse import datetime diff --git 
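The fused_nbit_rowwise_conversion_avx2.cc kernels deleted above all encode the same fused 8-bit rowwise format: each output row holds the uint8-quantized values followed by a float32 scale (range / 255) and a float32 bias (the row minimum). A minimal NumPy sketch of that round trip, assuming a 2-D float input; the helper names are illustrative and not part of this patch:

import numpy as np

def float_to_fused_8bit_rowwise(x):
    # Per row: [uint8 values | scale as float32 | bias as float32],
    # so output_columns = input_columns + 2 * sizeof(float).
    eps = 1e-8  # same role as kEpsilon in the deleted kernel
    rows, cols = x.shape
    out = np.empty((rows, cols + 8), dtype=np.uint8)
    for r in range(rows):
        row = x[r].astype(np.float32)
        minimum = row.min()
        value_range = row.max() - minimum
        scale = value_range / 255.0
        inverse_scale = 255.0 / (value_range + eps)
        # quantize, then append scale and bias as raw float32 bytes
        out[r, :cols] = np.rint((row - minimum) * inverse_scale).astype(np.uint8)
        out[r, cols:cols + 4] = np.frombuffer(np.float32(scale).tobytes(), dtype=np.uint8)
        out[r, cols + 4:] = np.frombuffer(np.float32(minimum).tobytes(), dtype=np.uint8)
    return out

def fused_8bit_rowwise_to_float(packed):
    # Inverse: dequantize each value as value * scale + bias.
    rows, packed_cols = packed.shape
    cols = packed_cols - 8
    out = np.empty((rows, cols), dtype=np.float32)
    for r in range(rows):
        scale = np.frombuffer(packed[r, cols:cols + 4].tobytes(), dtype=np.float32)[0]
        bias = np.frombuffer(packed[r, cols + 4:].tobytes(), dtype=np.float32)[0]
        out[r] = packed[r, :cols].astype(np.float32) * scale + bias
    return out

The N-bit variants (BIT_RATE 2 and 4) follow the same per-row idea but pack 8 / BIT_RATE values into each byte and store scale and bias as fp16, which is why the deleted code round-trips them through _cvtss_sh / _cvtsh_ss.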
a/caffe2/python/binarysize.py b/caffe2/python/binarysize.py index 802d61025e30..39dba40df8a0 100644 --- a/caffe2/python/binarysize.py +++ b/caffe2/python/binarysize.py @@ -15,10 +15,10 @@ green, assuming that you have a xterm connection that supports color. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import subprocess import sys diff --git a/caffe2/python/brew.py b/caffe2/python/brew.py index 2722c21d84d0..0e050ec32c44 100644 --- a/caffe2/python/brew.py +++ b/caffe2/python/brew.py @@ -1,9 +1,9 @@ ## @package model_helper_api # Module caffe2.python.model_helper_api -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import sys import copy diff --git a/caffe2/python/brew_test.py b/caffe2/python/brew_test.py index 8b3d08977c2c..4973876a8008 100644 --- a/caffe2/python/brew_test.py +++ b/caffe2/python/brew_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import brew, core, scope, workspace from caffe2.python.modeling.parameter_info import ParameterTags diff --git a/caffe2/python/build.py b/caffe2/python/build.py index 0f447265d5f4..862c031004c5 100644 --- a/caffe2/python/build.py +++ b/caffe2/python/build.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import caffe2.python._import_c_extension as C diff --git a/caffe2/python/cached_reader.py b/caffe2/python/cached_reader.py index 1dd179c71caf..980c4fe40e08 100644 --- a/caffe2/python/cached_reader.py +++ b/caffe2/python/cached_reader.py @@ -1,9 +1,9 @@ ## @package cached_reader # Module caffe2.python.cached_reader -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import os diff --git a/caffe2/python/checkpoint.py b/caffe2/python/checkpoint.py index cdd96eb1f492..9d7797fc3ada 100644 --- a/caffe2/python/checkpoint.py +++ b/caffe2/python/checkpoint.py @@ -1,9 +1,9 @@ ## @package checkpoint # Module caffe2.python.checkpoint -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import os import logging diff --git a/caffe2/python/checkpoint_test.py b/caffe2/python/checkpoint_test.py index a91bbf9910e2..90746747dd98 100644 --- a/caffe2/python/checkpoint_test.py +++ b/caffe2/python/checkpoint_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.schema import Struct, ConstRecord from caffe2.python import core, workspace, model_helper diff --git a/caffe2/python/cnn.py b/caffe2/python/cnn.py index aead1d599474..a0fd52e1fdbc 100644 --- a/caffe2/python/cnn.py +++ b/caffe2/python/cnn.py @@ -1,9 +1,9 @@ ## @package cnn # Module caffe2.python.cnn -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import brew, workspace from caffe2.python.model_helper 
import ModelHelper diff --git a/caffe2/python/context.py b/caffe2/python/context.py index 928807ba2805..28815bb7f36b 100644 --- a/caffe2/python/context.py +++ b/caffe2/python/context.py @@ -1,9 +1,9 @@ ## @package context # Module caffe2.python.context -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import threading import six diff --git a/caffe2/python/context_test.py b/caffe2/python/context_test.py index 6a1f77f5ecf8..6c259d326a19 100644 --- a/caffe2/python/context_test.py +++ b/caffe2/python/context_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import context, test_util from threading import Thread diff --git a/caffe2/python/control.py b/caffe2/python/control.py index dd332f745f9a..6b0654d6f26e 100644 --- a/caffe2/python/control.py +++ b/caffe2/python/control.py @@ -11,10 +11,10 @@ If """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from future.utils import viewitems diff --git a/caffe2/python/control_ops_grad.py b/caffe2/python/control_ops_grad.py index 5a8d24cf55d8..a0e85f4d0bc1 100644 --- a/caffe2/python/control_ops_grad.py +++ b/caffe2/python/control_ops_grad.py @@ -1,9 +1,9 @@ ## @package control_ops_grad # Module caffe2.python.control_ops_grad -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/control_ops_grad_test.py b/caffe2/python/control_ops_grad_test.py index a84b9ca0a168..f637e38a5e33 100644 --- a/caffe2/python/control_ops_grad_test.py +++ b/caffe2/python/control_ops_grad_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import core, test_util, workspace diff --git a/caffe2/python/control_ops_util.py b/caffe2/python/control_ops_util.py index 76ab14a7bc65..cfff82de318b 100644 --- a/caffe2/python/control_ops_util.py +++ b/caffe2/python/control_ops_util.py @@ -1,9 +1,9 @@ ## @package control_ops_util # Module caffe2.python.control_ops_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core diff --git a/caffe2/python/control_test.py b/caffe2/python/control_test.py index e51aeffa8b04..3f9df172d2b7 100644 --- a/caffe2/python/control_test.py +++ b/caffe2/python/control_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import control, core, test_util, workspace diff --git a/caffe2/python/convert.py b/caffe2/python/convert.py index 44f81d6e2d13..18033661a69e 100644 --- a/caffe2/python/convert.py +++ b/caffe2/python/convert.py @@ -1,9 +1,9 @@ ## @package workspace # Module caffe2.python.workspace -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + 
+ + + from caffe2.proto import caffe2_pb2, torch_pb2 diff --git a/caffe2/python/convert_test.py b/caffe2/python/convert_test.py index 82c969c901ea..a1dc52aad2d9 100644 --- a/caffe2/python/convert_test.py +++ b/caffe2/python/convert_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import convert, workspace from caffe2.proto import caffe2_pb2, torch_pb2 diff --git a/caffe2/python/core.py b/caffe2/python/core.py index 3b493277a182..6d7c503e2c81 100644 --- a/caffe2/python/core.py +++ b/caffe2/python/core.py @@ -1,9 +1,9 @@ ## @package core # Module caffe2.python.core -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from collections import namedtuple, OrderedDict, defaultdict from past.builtins import basestring diff --git a/caffe2/python/core_gradients_test.py b/caffe2/python/core_gradients_test.py index 8b229029f5f7..3674b7aa4585 100644 --- a/caffe2/python/core_gradients_test.py +++ b/caffe2/python/core_gradients_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from future.utils import bytes_to_native_str from hypothesis import given, settings diff --git a/caffe2/python/core_test.py b/caffe2/python/core_test.py index 8660f5cc2106..b0f5b11f0d1c 100644 --- a/caffe2/python/core_test.py +++ b/caffe2/python/core_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from inspect import currentframe, getframeinfo import unittest diff --git a/caffe2/python/crf.py b/caffe2/python/crf.py index a009f8f0fa31..703ae604c654 100644 --- a/caffe2/python/crf.py +++ b/caffe2/python/crf.py @@ -1,6 +1,6 @@ ## @package crf # Module caffe2.python.crf -from __future__ import absolute_import, division, print_function, unicode_literals + import numpy as np from caffe2.python import brew, core, model_helper, recurrent diff --git a/caffe2/python/crf_predict.py b/caffe2/python/crf_predict.py index dd1c8720bfb1..9bc0372c50c0 100644 --- a/caffe2/python/crf_predict.py +++ b/caffe2/python/crf_predict.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import numpy as np from caffe2.python.crf import CRFWithLoss diff --git a/caffe2/python/crf_viterbi_test.py b/caffe2/python/crf_viterbi_test.py index 970a7c6d4a8f..052bbbf4e6bf 100644 --- a/caffe2/python/crf_viterbi_test.py +++ b/caffe2/python/crf_viterbi_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace, crf from caffe2.python.cnn import CNNModelHelper diff --git a/caffe2/python/data_parallel_model.py b/caffe2/python/data_parallel_model.py index 7f5527472cc2..95abb7159d42 100644 --- a/caffe2/python/data_parallel_model.py +++ b/caffe2/python/data_parallel_model.py @@ -1,8 +1,8 @@ ## @package data_parallel_model # Module caffe2.python.data_parallel_model -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from collections import OrderedDict from future.utils import 
viewitems, viewkeys, viewvalues diff --git a/caffe2/python/data_parallel_model_test.py b/caffe2/python/data_parallel_model_test.py index e106dee97039..a0dbb3037c2c 100644 --- a/caffe2/python/data_parallel_model_test.py +++ b/caffe2/python/data_parallel_model_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from future.utils import viewkeys from multiprocessing import Process, Queue diff --git a/caffe2/python/data_workers.py b/caffe2/python/data_workers.py index eb49da78c0af..698a8953ef13 100644 --- a/caffe2/python/data_workers.py +++ b/caffe2/python/data_workers.py @@ -1,9 +1,9 @@ ## @package data_workers # Module caffe2.python.data_workers -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + ''' diff --git a/caffe2/python/data_workers_test.py b/caffe2/python/data_workers_test.py index 1abd8dfa28d7..4669aaf59476 100644 --- a/caffe2/python/data_workers_test.py +++ b/caffe2/python/data_workers_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/python/dataio.py b/caffe2/python/dataio.py index 5221262582ee..ff6e9c6860f6 100644 --- a/caffe2/python/dataio.py +++ b/caffe2/python/dataio.py @@ -15,10 +15,10 @@ See `dataset.py` for an example of implementation. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.schema import Field, Struct, from_blob_list diff --git a/caffe2/python/dataio_test.py b/caffe2/python/dataio_test.py index 26f1c0902f71..0c45fb50aed9 100644 --- a/caffe2/python/dataio_test.py +++ b/caffe2/python/dataio_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.dataio import ( CompositeReader, diff --git a/caffe2/python/dataset.py b/caffe2/python/dataset.py index 387dbbaead58..4c2d4c806476 100644 --- a/caffe2/python/dataset.py +++ b/caffe2/python/dataset.py @@ -10,10 +10,10 @@ is stored as a set of native Caffe2 tensors, thus no type conversion or deserialization is necessary. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.dataio import Reader, Writer diff --git a/caffe2/python/db_file_reader.py b/caffe2/python/db_file_reader.py index 9296f1c6b7db..265b19251717 100644 --- a/caffe2/python/db_file_reader.py +++ b/caffe2/python/db_file_reader.py @@ -1,9 +1,9 @@ ## @package db_file_reader # Module caffe2.python.db_file_reader -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, scope, workspace, _import_c_extension as C from caffe2.python.dataio import Reader diff --git a/caffe2/python/db_test.py b/caffe2/python/db_test.py index f642202b36f0..f0f5d2770dc0 100644 --- a/caffe2/python/db_test.py +++ b/caffe2/python/db_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace diff --git a/caffe2/python/docs/formatter.py b/caffe2/python/docs/formatter.py index 0a16420f6d5a..904f1731e960 100644 --- a/caffe2/python/docs/formatter.py +++ b/caffe2/python/docs/formatter.py @@ -1,9 +1,9 @@ ## @package formatter # Module caffe2.python.docs.formatter -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.docs.parser import Parser diff --git a/caffe2/python/docs/generator.py b/caffe2/python/docs/generator.py index 1bc41b7d1ccb..c5a7df369bc2 100644 --- a/caffe2/python/docs/generator.py +++ b/caffe2/python/docs/generator.py @@ -1,9 +1,9 @@ ## @package generator # Module caffe2.python.docs.generator -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import os from caffe2.python import core, workspace diff --git a/caffe2/python/docs/github.py b/caffe2/python/docs/github.py index 5cb1fdcf5d7b..3fd78507346e 100644 --- a/caffe2/python/docs/github.py +++ b/caffe2/python/docs/github.py @@ -1,9 +1,9 @@ ## @package github # Module caffe2.python.docs.github -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import os from caffe2.python.docs.formatter import Markdown diff --git a/caffe2/python/docs/parser.py b/caffe2/python/docs/parser.py index 024989c97e25..a4edb6e07246 100644 --- a/caffe2/python/docs/parser.py +++ b/caffe2/python/docs/parser.py @@ -1,9 +1,9 @@ ## @package parser # Module caffe2.python.docs.parser -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import re diff --git a/caffe2/python/dyndep.py b/caffe2/python/dyndep.py index 8bea14423875..0382cc3a8212 100644 --- a/caffe2/python/dyndep.py +++ b/caffe2/python/dyndep.py @@ -1,9 +1,9 @@ ## @package dyndep # Module caffe2.python.dyndep -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import ctypes import os diff --git a/caffe2/python/embedding_generation_benchmark.py 
b/caffe2/python/embedding_generation_benchmark.py index a4d66036b93d..33dbf757dda4 100644 --- a/caffe2/python/embedding_generation_benchmark.py +++ b/caffe2/python/embedding_generation_benchmark.py @@ -1,9 +1,9 @@ ## @package embedding_generation_benchmark # Module caffe2.python.embedding_generation_benchmark -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import workspace, core, utils, model_helper diff --git a/caffe2/python/examples/char_rnn.py b/caffe2/python/examples/char_rnn.py index fb2059f94868..59e85431e8bf 100644 --- a/caffe2/python/examples/char_rnn.py +++ b/caffe2/python/examples/char_rnn.py @@ -1,9 +1,9 @@ ## @package char_rnn # Module caffe2.python.examples.char_rnn -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace, model_helper, utils, brew from caffe2.python.rnn_cell import LSTM diff --git a/caffe2/python/examples/lmdb_create_example.py b/caffe2/python/examples/lmdb_create_example.py index b29b3b806001..af56069a7be0 100644 --- a/caffe2/python/examples/lmdb_create_example.py +++ b/caffe2/python/examples/lmdb_create_example.py @@ -1,9 +1,9 @@ ## @package lmdb_create_example # Module caffe2.python.examples.lmdb_create_example -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import numpy as np diff --git a/caffe2/python/experiment_util.py b/caffe2/python/experiment_util.py index cbe9491d9cf6..822a0a2950ba 100644 --- a/caffe2/python/experiment_util.py +++ b/caffe2/python/experiment_util.py @@ -1,9 +1,9 @@ ## @package experiment_util # Module caffe2.python.experiment_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import datetime import time diff --git a/caffe2/python/extension_loader.py b/caffe2/python/extension_loader.py index c533ae6d77bc..06c6707dcce9 100644 --- a/caffe2/python/extension_loader.py +++ b/caffe2/python/extension_loader.py @@ -1,9 +1,9 @@ ## @package extension_loader # Module caffe2.python.extension_loader -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import contextlib import ctypes import sys diff --git a/caffe2/python/fakefp16_transform_lib.py b/caffe2/python/fakefp16_transform_lib.py index 885f15732055..c3f142061479 100644 --- a/caffe2/python/fakefp16_transform_lib.py +++ b/caffe2/python/fakefp16_transform_lib.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + import caffe2.python._import_c_extension as C from caffe2.proto.caffe2_pb2 import NetDef diff --git a/caffe2/python/fakelowp/init_shared_libs.py b/caffe2/python/fakelowp/init_shared_libs.py index d289c7c4a97d..2a98de4571aa 100644 --- a/caffe2/python/fakelowp/init_shared_libs.py +++ b/caffe2/python/fakelowp/init_shared_libs.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import ctypes import os diff --git a/caffe2/python/fakelowp/test_utils.py 
b/caffe2/python/fakelowp/test_utils.py index 75e4422f3ccc..4a31a92e5bce 100644 --- a/caffe2/python/fakelowp/test_utils.py +++ b/caffe2/python/fakelowp/test_utils.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import sys import numpy as np diff --git a/caffe2/python/filler_test.py b/caffe2/python/filler_test.py index 52ea756d5bea..9aff384e99af 100644 --- a/caffe2/python/filler_test.py +++ b/caffe2/python/filler_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, test_util, workspace diff --git a/caffe2/python/functional.py b/caffe2/python/functional.py index 7c26f69a0c43..d32acb3d8a90 100644 --- a/caffe2/python/functional.py +++ b/caffe2/python/functional.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/functional_test.py b/caffe2/python/functional_test.py index e7803e829bb4..d90943761aa4 100644 --- a/caffe2/python/functional_test.py +++ b/caffe2/python/functional_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest diff --git a/caffe2/python/fused_8bit_rowwise_conversion_ops_test.py b/caffe2/python/fused_8bit_rowwise_conversion_ops_test.py index d2ecf118ea27..a7e5d714b63c 100644 --- a/caffe2/python/fused_8bit_rowwise_conversion_ops_test.py +++ b/caffe2/python/fused_8bit_rowwise_conversion_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/gradient_check_test.py b/caffe2/python/gradient_check_test.py index 1b492229a433..3f8dd83b5538 100644 --- a/caffe2/python/gradient_check_test.py +++ b/caffe2/python/gradient_check_test.py @@ -2,10 +2,10 @@ # can gradually remove this test script. DO NOT ADD MORE TESTS TO THIS # FILE. 
-from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from caffe2.python import ( brew, diff --git a/caffe2/python/gradient_checker.py b/caffe2/python/gradient_checker.py index b1cdcc2bbb56..afb8d5071492 100644 --- a/caffe2/python/gradient_checker.py +++ b/caffe2/python/gradient_checker.py @@ -1,9 +1,9 @@ ## @package gradient_checker # Module caffe2.python.gradient_checker -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/gru_cell.py b/caffe2/python/gru_cell.py index e6caa2cae1eb..049a9152878a 100644 --- a/caffe2/python/gru_cell.py +++ b/caffe2/python/gru_cell.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import functools from caffe2.python import brew, rnn_cell diff --git a/caffe2/python/helpers/algebra.py b/caffe2/python/helpers/algebra.py index 6bc3779a4ca1..948c55ac88ce 100644 --- a/caffe2/python/helpers/algebra.py +++ b/caffe2/python/helpers/algebra.py @@ -1,9 +1,9 @@ ## @package algebra # Module caffe2.python.helpers.algebra -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def transpose(model, blob_in, blob_out, use_cudnn=False, **kwargs): diff --git a/caffe2/python/helpers/arg_scope.py b/caffe2/python/helpers/arg_scope.py index ac6978be8064..a112e9b84c5d 100644 --- a/caffe2/python/helpers/arg_scope.py +++ b/caffe2/python/helpers/arg_scope.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import contextlib import copy import threading diff --git a/caffe2/python/helpers/array_helpers.py b/caffe2/python/helpers/array_helpers.py index 3f8955331d4e..fae0011bf1f6 100644 --- a/caffe2/python/helpers/array_helpers.py +++ b/caffe2/python/helpers/array_helpers.py @@ -1,9 +1,9 @@ ## @package arra_helpers # Module caffe2.python.helpers.array_helpers -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def concat(model, blobs_in, blob_out, **kwargs): diff --git a/caffe2/python/helpers/control_ops.py b/caffe2/python/helpers/control_ops.py index a738a71fe44c..c6f71d0761a5 100644 --- a/caffe2/python/helpers/control_ops.py +++ b/caffe2/python/helpers/control_ops.py @@ -1,9 +1,9 @@ ## @package control_ops # Module caffe2.python.helpers.control_ops -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.control_ops_util import add_if_op, add_while_op diff --git a/caffe2/python/helpers/conv.py b/caffe2/python/helpers/conv.py index bb88b2e3757f..dfca165084df 100644 --- a/caffe2/python/helpers/conv.py +++ b/caffe2/python/helpers/conv.py @@ -1,9 +1,9 @@ ## @package conv # Module caffe2.python.helpers.conv -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.modeling import initializers diff --git 
a/caffe2/python/helpers/db_input.py b/caffe2/python/helpers/db_input.py index 6e642a393da4..d5772cb7653e 100644 --- a/caffe2/python/helpers/db_input.py +++ b/caffe2/python/helpers/db_input.py @@ -1,9 +1,9 @@ ## @package db_input # Module caffe2.python.helpers.db_input -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def db_input(model, blobs_out, batch_size, db, db_type): dbreader_name = "dbreader_" + db diff --git a/caffe2/python/helpers/dropout.py b/caffe2/python/helpers/dropout.py index 6fbb5bcda99a..d7280318f60d 100644 --- a/caffe2/python/helpers/dropout.py +++ b/caffe2/python/helpers/dropout.py @@ -1,9 +1,9 @@ ## @package dropout # Module caffe2.python.helpers.dropout -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def dropout(model, blob_in, blob_out, use_cudnn=False, **kwargs): diff --git a/caffe2/python/helpers/elementwise_linear.py b/caffe2/python/helpers/elementwise_linear.py index 55fbd708489c..ef9184d00dd2 100644 --- a/caffe2/python/helpers/elementwise_linear.py +++ b/caffe2/python/helpers/elementwise_linear.py @@ -1,9 +1,9 @@ ## @package elementwise_linear # Module caffe2.python.helpers.elementwise_linear -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.modeling.parameter_info import ParameterTags diff --git a/caffe2/python/helpers/fc.py b/caffe2/python/helpers/fc.py index 9d61dc7ac145..0feb2b65745e 100644 --- a/caffe2/python/helpers/fc.py +++ b/caffe2/python/helpers/fc.py @@ -1,9 +1,9 @@ ## @package fc # Module caffe2.python.helpers.fc -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.modeling import initializers diff --git a/caffe2/python/helpers/nonlinearity.py b/caffe2/python/helpers/nonlinearity.py index f773cc3114de..3a8be3bb056a 100644 --- a/caffe2/python/helpers/nonlinearity.py +++ b/caffe2/python/helpers/nonlinearity.py @@ -1,9 +1,9 @@ ## @package nonlinearity # Module caffe2.python.helpers.nonlinearity -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core diff --git a/caffe2/python/helpers/normalization.py b/caffe2/python/helpers/normalization.py index 621f565b5455..b13b43f6859a 100644 --- a/caffe2/python/helpers/normalization.py +++ b/caffe2/python/helpers/normalization.py @@ -1,9 +1,9 @@ ## @package normalization # Module caffe2.python.helpers.normalization -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import scope from caffe2.python.modeling.parameter_info import ParameterTags diff --git a/caffe2/python/helpers/pooling.py b/caffe2/python/helpers/pooling.py index 412d55434d16..9e6fc784f289 100644 --- a/caffe2/python/helpers/pooling.py +++ b/caffe2/python/helpers/pooling.py @@ -2,10 +2,10 @@ # Module caffe2.python.helpers.pooling ## @package fc # Module caffe2.python.helpers.pooling -from __future__ import absolute_import -from __future__ import division -from 
__future__ import print_function -from __future__ import unicode_literals + + + + def max_pool(model, blob_in, blob_out, use_cudnn=False, order="NCHW", **kwargs): diff --git a/caffe2/python/helpers/tools.py b/caffe2/python/helpers/tools.py index 59defe9e236b..178620eab593 100644 --- a/caffe2/python/helpers/tools.py +++ b/caffe2/python/helpers/tools.py @@ -1,9 +1,9 @@ ## @package tools # Module caffe2.python.helpers.tools -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def image_input( diff --git a/caffe2/python/helpers/train.py b/caffe2/python/helpers/train.py index bee36347808a..02883af7402d 100644 --- a/caffe2/python/helpers/train.py +++ b/caffe2/python/helpers/train.py @@ -1,9 +1,9 @@ ## @package train # Module caffe2.python.helpers.train -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, scope from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/hip_test_util.py b/caffe2/python/hip_test_util.py index 3910c9e5c2ce..beab3be1c40a 100644 --- a/caffe2/python/hip_test_util.py +++ b/caffe2/python/hip_test_util.py @@ -6,10 +6,10 @@ operators. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/hsm_util.py b/caffe2/python/hsm_util.py index e98056f9cd88..ec465c12240e 100644 --- a/caffe2/python/hsm_util.py +++ b/caffe2/python/hsm_util.py @@ -1,9 +1,9 @@ ## @package hsm_util # Module caffe2.python.hsm_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import hsm_pb2 diff --git a/caffe2/python/hypothesis_test.py b/caffe2/python/hypothesis_test.py index 897be5fab44a..045677f8422a 100644 --- a/caffe2/python/hypothesis_test.py +++ b/caffe2/python/hypothesis_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np import copy @@ -10,7 +10,7 @@ from hypothesis import assume, given, settings, HealthCheck import hypothesis.strategies as st import unittest -import os +import threading from caffe2.python import core, workspace, tt_core, dyndep import caffe2.python.hypothesis_test_util as hu @@ -2695,6 +2695,60 @@ def histogram(X): self.assertDeviceChecks(dc, op, [X], [0, 1]) self.assertReferenceChecks(gc, op, [X], histogram) + @settings(max_examples=1, deadline=None) + @given( + queue_capacity=st.integers(2, 2), + time_sleep=st.integers(5, 10), + num_blobs_to_equeue=st.integers(1, 1), + num_blobs_to_dequeue=st.integers(2, 2), + ) + def test_safe_dequeue_blob__raises_exception_when_hang( + self, + queue_capacity, + time_sleep, + num_blobs_to_equeue, + num_blobs_to_dequeue, + ): + r""" + Tests that SafeDequeueBlobsOp is cancellable. + + Create a BlobsQueue that holds fewer blobs than SafeDequeueBlobs + requests, so running the net hangs on the dequeue. + + Then cancel the net from a background thread after a short sleep and + check that an exception is raised.
+ """ + + def _net_instance_cancel(net_instance): + time.sleep(time_sleep) + net_instance.cancel() + + init_net = core.Net("init_net") + init_net.Proto().type = "async_scheduling" + + queue = init_net.CreateBlobsQueue( + [], + "queue_name", + capacity=queue_capacity, + num_blobs=num_blobs_to_equeue, + ) + + ws = workspace.Workspace() + ws.create_net(init_net).run() + + net = core.Net("net") + net.Proto().type = "async_scheduling" + + blobs = net.SafeDequeueBlobs([queue], num_blobs_to_dequeue) + + net_instance = ws.create_net(net) + + t = threading.Thread(target=_net_instance_cancel, args=[net_instance]) + t.start() + + with self.assertRaises(Exception): + net_instance.run() + t.join() if __name__ == "__main__": diff --git a/caffe2/python/hypothesis_test_util.py b/caffe2/python/hypothesis_test_util.py index 797010b46890..2000e269969e 100644 --- a/caffe2/python/hypothesis_test_util.py +++ b/caffe2/python/hypothesis_test_util.py @@ -34,10 +34,10 @@ implemented on the CPU. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import ( workspace, device_checker, gradient_checker, test_util, core) diff --git a/caffe2/python/ideep/LRN_op_test.py b/caffe2/python/ideep/LRN_op_test.py index 956f10be8831..23ecd79062f7 100644 --- a/caffe2/python/ideep/LRN_op_test.py +++ b/caffe2/python/ideep/LRN_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/adam_op_test.py b/caffe2/python/ideep/adam_op_test.py index a0d9b2ce014f..5ac0395bff63 100644 --- a/caffe2/python/ideep/adam_op_test.py +++ b/caffe2/python/ideep/adam_op_test.py @@ -1,7 +1,7 @@ -from __future__ import unicode_literals -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + + import numpy as np import hypothesis.strategies as st diff --git a/caffe2/python/ideep/blobs_queue_db_test.py b/caffe2/python/ideep/blobs_queue_db_test.py index ded18e89c5ae..966fcc23d47d 100644 --- a/caffe2/python/ideep/blobs_queue_db_test.py +++ b/caffe2/python/ideep/blobs_queue_db_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/ideep/channel_shuffle_op_test.py b/caffe2/python/ideep/channel_shuffle_op_test.py index 8c3eea3d8618..b4cedca61061 100644 --- a/caffe2/python/ideep/channel_shuffle_op_test.py +++ b/caffe2/python/ideep/channel_shuffle_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/concat_split_op_test.py b/caffe2/python/ideep/concat_split_op_test.py index c28a7f1fe52c..75c9ceeba0e4 100644 --- a/caffe2/python/ideep/concat_split_op_test.py +++ b/caffe2/python/ideep/concat_split_op_test.py @@ -1,7 +1,7 @@ -from __future__ import unicode_literals -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + + import numpy as np import hypothesis.strategies 
as st diff --git a/caffe2/python/ideep/conv_op_test.py b/caffe2/python/ideep/conv_op_test.py index e82d8aec5515..ae4473ea4864 100644 --- a/caffe2/python/ideep/conv_op_test.py +++ b/caffe2/python/ideep/conv_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import sys diff --git a/caffe2/python/ideep/conv_transpose_test.py b/caffe2/python/ideep/conv_transpose_test.py index be35dbd8a382..eeda2ea43a2d 100644 --- a/caffe2/python/ideep/conv_transpose_test.py +++ b/caffe2/python/ideep/conv_transpose_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import unittest import numpy as np diff --git a/caffe2/python/ideep/convfusion_op_test.py b/caffe2/python/ideep/convfusion_op_test.py index f24333745741..18ce574b623b 100644 --- a/caffe2/python/ideep/convfusion_op_test.py +++ b/caffe2/python/ideep/convfusion_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/copy_op_test.py b/caffe2/python/ideep/copy_op_test.py index 4b0a15bd999a..668282f2e159 100644 --- a/caffe2/python/ideep/copy_op_test.py +++ b/caffe2/python/ideep/copy_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/ideep/dropout_op_test.py b/caffe2/python/ideep/dropout_op_test.py index efecfb501bff..33b0a52a7421 100644 --- a/caffe2/python/ideep/dropout_op_test.py +++ b/caffe2/python/ideep/dropout_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from hypothesis import given diff --git a/caffe2/python/ideep/elementwise_sum_op_test.py b/caffe2/python/ideep/elementwise_sum_op_test.py index 9daf34088fc0..11a35d6b2b28 100644 --- a/caffe2/python/ideep/elementwise_sum_op_test.py +++ b/caffe2/python/ideep/elementwise_sum_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/expanddims_squeeze_op_test.py b/caffe2/python/ideep/expanddims_squeeze_op_test.py index 4a4fb7319b25..3693a217bb4b 100644 --- a/caffe2/python/ideep/expanddims_squeeze_op_test.py +++ b/caffe2/python/ideep/expanddims_squeeze_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/fc_op_test.py b/caffe2/python/ideep/fc_op_test.py index 9e29bfaed919..6549bb6ad6bb 100644 --- a/caffe2/python/ideep/fc_op_test.py +++ b/caffe2/python/ideep/fc_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from functools 
import reduce diff --git a/caffe2/python/ideep/leaky_relu_op_test.py b/caffe2/python/ideep/leaky_relu_op_test.py index 8a68d2e608ef..6d84f88f4fe2 100644 --- a/caffe2/python/ideep/leaky_relu_op_test.py +++ b/caffe2/python/ideep/leaky_relu_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/moment_sgd_op_test.py b/caffe2/python/ideep/moment_sgd_op_test.py index 06d0e9be0e57..596bab0ad3cc 100644 --- a/caffe2/python/ideep/moment_sgd_op_test.py +++ b/caffe2/python/ideep/moment_sgd_op_test.py @@ -1,7 +1,7 @@ -from __future__ import unicode_literals -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + + import numpy as np import hypothesis.strategies as st diff --git a/caffe2/python/ideep/operator_fallback_op_test.py b/caffe2/python/ideep/operator_fallback_op_test.py index 6d40a88b5c13..dc928c264082 100644 --- a/caffe2/python/ideep/operator_fallback_op_test.py +++ b/caffe2/python/ideep/operator_fallback_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/order_switch_op_test.py b/caffe2/python/ideep/order_switch_op_test.py index 8a967dcf9c08..a259e01bab10 100644 --- a/caffe2/python/ideep/order_switch_op_test.py +++ b/caffe2/python/ideep/order_switch_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/ideep/pool_op_test.py b/caffe2/python/ideep/pool_op_test.py index 9659d3961338..9ab3fcddbadb 100644 --- a/caffe2/python/ideep/pool_op_test.py +++ b/caffe2/python/ideep/pool_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/pre_convert_test.py b/caffe2/python/ideep/pre_convert_test.py index a32eedd74469..6c0b7ca5d7a7 100644 --- a/caffe2/python/ideep/pre_convert_test.py +++ b/caffe2/python/ideep/pre_convert_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/relu_op_test.py b/caffe2/python/ideep/relu_op_test.py index bd05c69381c5..e2fda68aed2b 100644 --- a/caffe2/python/ideep/relu_op_test.py +++ b/caffe2/python/ideep/relu_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/reshape_op_test.py b/caffe2/python/ideep/reshape_op_test.py index c9714f6eb4a5..c2bca948a52c 100644 --- a/caffe2/python/ideep/reshape_op_test.py +++ b/caffe2/python/ideep/reshape_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ 
import print_function -from __future__ import unicode_literals + + + + from caffe2.python.test_util import TestCase from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/ideep/shape_op_test.py b/caffe2/python/ideep/shape_op_test.py index e1ab30c12e45..47114832f85d 100644 --- a/caffe2/python/ideep/shape_op_test.py +++ b/caffe2/python/ideep/shape_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/sigmoid_op_test.py b/caffe2/python/ideep/sigmoid_op_test.py index b67932108084..2b5eb0e3a2b5 100644 --- a/caffe2/python/ideep/sigmoid_op_test.py +++ b/caffe2/python/ideep/sigmoid_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/softmax_op_test.py b/caffe2/python/ideep/softmax_op_test.py index 9043061514a0..b76d6509609b 100644 --- a/caffe2/python/ideep/softmax_op_test.py +++ b/caffe2/python/ideep/softmax_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/spatial_bn_op_test.py b/caffe2/python/ideep/spatial_bn_op_test.py index 25b83e2447fc..618a0e7fbfc3 100644 --- a/caffe2/python/ideep/spatial_bn_op_test.py +++ b/caffe2/python/ideep/spatial_bn_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given, settings import hypothesis.strategies as st diff --git a/caffe2/python/ideep/test_ideep_net.py b/caffe2/python/ideep/test_ideep_net.py index b0483cf4c4b6..aa1c5bc260fa 100644 --- a/caffe2/python/ideep/test_ideep_net.py +++ b/caffe2/python/ideep/test_ideep_net.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/ideep/transform_ideep_net.py b/caffe2/python/ideep/transform_ideep_net.py index 6345b76735a7..962d4051718b 100644 --- a/caffe2/python/ideep/transform_ideep_net.py +++ b/caffe2/python/ideep/transform_ideep_net.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import copy diff --git a/caffe2/python/ideep/transpose_op_test.py b/caffe2/python/ideep/transpose_op_test.py index b02085a3ba3b..8b324ed964ae 100644 --- a/caffe2/python/ideep/transpose_op_test.py +++ b/caffe2/python/ideep/transpose_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/weightedsum_op_test.py b/caffe2/python/ideep/weightedsum_op_test.py index 2a0b3ec3e7b0..b1e46fca4851 100644 --- a/caffe2/python/ideep/weightedsum_op_test.py +++ 
b/caffe2/python/ideep/weightedsum_op_test.py @@ -1,7 +1,7 @@ -from __future__ import unicode_literals -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + + import numpy as np import hypothesis.strategies as st diff --git a/caffe2/python/ideep_test_util.py b/caffe2/python/ideep_test_util.py index e131ee027c35..7129ed14ba74 100644 --- a/caffe2/python/ideep_test_util.py +++ b/caffe2/python/ideep_test_util.py @@ -6,10 +6,10 @@ operators. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st diff --git a/caffe2/python/layer_model_helper.py b/caffe2/python/layer_model_helper.py index 90e5a4d76b6d..7c3dda3b320c 100644 --- a/caffe2/python/layer_model_helper.py +++ b/caffe2/python/layer_model_helper.py @@ -1,9 +1,9 @@ # @package layer_model_helper # Module caffe2.python.layer_model_helper -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, model_helper, schema, scope, utils, muji from caffe2.python.modeling.parameter_info import ( diff --git a/caffe2/python/layer_model_instantiator.py b/caffe2/python/layer_model_instantiator.py index 9ceb1310bf30..9284b9b9e687 100644 --- a/caffe2/python/layer_model_instantiator.py +++ b/caffe2/python/layer_model_instantiator.py @@ -1,9 +1,9 @@ ## @package layer_model_instantiator # Module caffe2.python.layer_model_instantiator -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import InstantiationContext diff --git a/caffe2/python/layer_parameter_sharing_test.py b/caffe2/python/layer_parameter_sharing_test.py index 5d87dbd7522a..518412b9e90c 100644 --- a/caffe2/python/layer_parameter_sharing_test.py +++ b/caffe2/python/layer_parameter_sharing_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, scope from caffe2.python.modeling.parameter_sharing import ( diff --git a/caffe2/python/layer_test_util.py b/caffe2/python/layer_test_util.py index 2f2e23062e34..ae28e82b98cc 100644 --- a/caffe2/python/layer_test_util.py +++ b/caffe2/python/layer_test_util.py @@ -1,9 +1,9 @@ ## @package layer_test_util # Module caffe2.python.layer_test_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from collections import namedtuple diff --git a/caffe2/python/layers/__init__.py b/caffe2/python/layers/__init__.py index 2a09dc8419a6..487b7751fd08 100644 --- a/caffe2/python/layers/__init__.py +++ b/caffe2/python/layers/__init__.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from importlib import import_module import pkgutil diff --git a/caffe2/python/layers/adaptive_weight.py b/caffe2/python/layers/adaptive_weight.py index c081e8573038..146a0bdb1974 100644 --- a/caffe2/python/layers/adaptive_weight.py +++ b/caffe2/python/layers/adaptive_weight.py @@ -1,6 +1,6 @@ 
# @package adaptive_weight # Module caffe2.fb.python.layers.adaptive_weight -from __future__ import absolute_import, division, print_function, unicode_literals + import numpy as np from caffe2.python import core, schema diff --git a/caffe2/python/layers/add_bias.py b/caffe2/python/layers/add_bias.py index 0ffa46afb2b3..1a0fd8b295f3 100644 --- a/caffe2/python/layers/add_bias.py +++ b/caffe2/python/layers/add_bias.py @@ -1,9 +1,9 @@ ## @package add_bias # Module caffe2.python.layers.add_bias -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/arc_cosine_feature_map.py b/caffe2/python/layers/arc_cosine_feature_map.py index 2409eca551a1..89c5014f5c5c 100644 --- a/caffe2/python/layers/arc_cosine_feature_map.py +++ b/caffe2/python/layers/arc_cosine_feature_map.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/batch_huber_loss.py b/caffe2/python/layers/batch_huber_loss.py index 48b6ebcf8f58..0a5323625419 100644 --- a/caffe2/python/layers/batch_huber_loss.py +++ b/caffe2/python/layers/batch_huber_loss.py @@ -1,9 +1,9 @@ # @package batch_huber_loss # Module caffe2.python.layers.batch_huber_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/batch_lr_loss.py b/caffe2/python/layers/batch_lr_loss.py index a560a3f654a9..46b0e4d42cdf 100644 --- a/caffe2/python/layers/batch_lr_loss.py +++ b/caffe2/python/layers/batch_lr_loss.py @@ -1,9 +1,9 @@ ## @package batch_lr_loss # Module caffe2.python.layers.batch_lr_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/batch_mse_loss.py b/caffe2/python/layers/batch_mse_loss.py index 89da74f3c1e9..b0dd63ab09c8 100644 --- a/caffe2/python/layers/batch_mse_loss.py +++ b/caffe2/python/layers/batch_mse_loss.py @@ -1,9 +1,9 @@ ## @package batch_mse_loss # Module caffe2.python.layers.batch_mse_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/batch_normalization.py b/caffe2/python/layers/batch_normalization.py index 9fe3ee51eb56..6395b09ff67f 100644 --- a/caffe2/python/layers/batch_normalization.py +++ b/caffe2/python/layers/batch_normalization.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/batch_sigmoid_cross_entropy_loss.py b/caffe2/python/layers/batch_sigmoid_cross_entropy_loss.py 
index 9ef8cf563dbe..84e7d4873f50 100644 --- a/caffe2/python/layers/batch_sigmoid_cross_entropy_loss.py +++ b/caffe2/python/layers/batch_sigmoid_cross_entropy_loss.py @@ -1,9 +1,9 @@ ## @package batch_sigmoid_cross_entropy_loss # Module caffe2.python.layers.batch_sigmoid_cross_entropy_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/batch_softmax_loss.py b/caffe2/python/layers/batch_softmax_loss.py index d5f9413ef96a..30667a04c159 100644 --- a/caffe2/python/layers/batch_softmax_loss.py +++ b/caffe2/python/layers/batch_softmax_loss.py @@ -1,9 +1,9 @@ ## @package batch_softmax_loss # Module caffe2.python.layers.batch_softmax_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/blob_weighted_sum.py b/caffe2/python/layers/blob_weighted_sum.py index cf8ecfd99045..a37fab463581 100644 --- a/caffe2/python/layers/blob_weighted_sum.py +++ b/caffe2/python/layers/blob_weighted_sum.py @@ -1,9 +1,9 @@ ## @package BlobWeightedSum # Module caffe2.python.layers.blob_weighted_sum -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/bpr_loss.py b/caffe2/python/layers/bpr_loss.py index 4e6a60fdaa57..389de8c241e8 100644 --- a/caffe2/python/layers/bpr_loss.py +++ b/caffe2/python/layers/bpr_loss.py @@ -1,9 +1,9 @@ ## @package bpr_loss # Module caffe2.python.layers.bpr_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/bucket_weighted.py b/caffe2/python/layers/bucket_weighted.py index 3c750e7b136f..2c200a922fdd 100644 --- a/caffe2/python/layers/bucket_weighted.py +++ b/caffe2/python/layers/bucket_weighted.py @@ -1,9 +1,9 @@ ## @package bucket_weighted # Module caffe2.python.layers.bucket_weighted -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import logging import numpy as np diff --git a/caffe2/python/layers/build_index.py b/caffe2/python/layers/build_index.py index b8c999bc256e..29c63f3d8948 100644 --- a/caffe2/python/layers/build_index.py +++ b/caffe2/python/layers/build_index.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/layers/concat.py b/caffe2/python/layers/concat.py index 062485757edc..fb1dc6ab6dbf 100644 --- a/caffe2/python/layers/concat.py +++ b/caffe2/python/layers/concat.py @@ -1,9 +1,9 @@ ## @package concat # Module caffe2.python.layers.concat -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from 
caffe2.python import schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/constant_weight.py b/caffe2/python/layers/constant_weight.py index 06e9d9cd9b66..d160ed8206b3 100644 --- a/caffe2/python/layers/constant_weight.py +++ b/caffe2/python/layers/constant_weight.py @@ -1,9 +1,9 @@ # @package constant_weight # Module caffe2.fb.python.layers.constant_weight -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/conv.py b/caffe2/python/layers/conv.py index bb22acf0cafa..e98bac7e2d80 100644 --- a/caffe2/python/layers/conv.py +++ b/caffe2/python/layers/conv.py @@ -1,9 +1,9 @@ ## @package conv # Module caffe2.python.layers.conv -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/dropout.py b/caffe2/python/layers/dropout.py index a5d3f01a440e..4bc0cf2785b2 100644 --- a/caffe2/python/layers/dropout.py +++ b/caffe2/python/layers/dropout.py @@ -1,8 +1,8 @@ # Module caffe2.python.layers.dropout -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/fc.py b/caffe2/python/layers/fc.py index a9eeceff2c21..9220f22165a3 100644 --- a/caffe2/python/layers/fc.py +++ b/caffe2/python/layers/fc.py @@ -1,9 +1,9 @@ ## @package fc # Module caffe2.python.layers.fc -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.helpers.arg_scope import get_current_scope from caffe2.python import schema diff --git a/caffe2/python/layers/fc_with_bootstrap.py b/caffe2/python/layers/fc_with_bootstrap.py index 6a48f572ddba..b3c2eb346f96 100644 --- a/caffe2/python/layers/fc_with_bootstrap.py +++ b/caffe2/python/layers/fc_with_bootstrap.py @@ -1,6 +1,6 @@ ## @package fc_with_bootstrap # Module caffe2.python.layers.fc_with_bootstrap -from __future__ import absolute_import, division, print_function, unicode_literals + import math diff --git a/caffe2/python/layers/fc_without_bias.py b/caffe2/python/layers/fc_without_bias.py index e8923a8e5b9c..2899af618b79 100644 --- a/caffe2/python/layers/fc_without_bias.py +++ b/caffe2/python/layers/fc_without_bias.py @@ -1,9 +1,9 @@ ## @package fc_without_bias # Module caffe2.python.layers.fc_without_bias -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/feature_sparse_to_dense.py b/caffe2/python/layers/feature_sparse_to_dense.py index 69fe91a48691..ca004d136ded 100644 --- a/caffe2/python/layers/feature_sparse_to_dense.py +++ b/caffe2/python/layers/feature_sparse_to_dense.py @@ -1,6 +1,6 @@ # @package sparse_to_dense # Module caffe2.python.layers.sparse_to_dense -from __future__ import absolute_import, division, print_function, unicode_literals + from collections 
import defaultdict diff --git a/caffe2/python/layers/functional.py b/caffe2/python/layers/functional.py index 53d5c050242f..c6d156fd68ce 100644 --- a/caffe2/python/layers/functional.py +++ b/caffe2/python/layers/functional.py @@ -1,9 +1,9 @@ # @package functional # Module caffe2.python.layers.functional -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema, scope, workspace from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/gather_record.py b/caffe2/python/layers/gather_record.py index 1289c097902c..da468d5db90c 100644 --- a/caffe2/python/layers/gather_record.py +++ b/caffe2/python/layers/gather_record.py @@ -1,9 +1,9 @@ ## @package gather_record # Module caffe2.python.layers.gather_record -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/homotopy_weight.py b/caffe2/python/layers/homotopy_weight.py index 63da1f04abf4..4c24223cbc8d 100644 --- a/caffe2/python/layers/homotopy_weight.py +++ b/caffe2/python/layers/homotopy_weight.py @@ -1,10 +1,10 @@ # @package homotopy_weight # Module caffe2.fb.python.layers.homotopy_weight -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/label_smooth.py b/caffe2/python/layers/label_smooth.py index e2282e051611..7e4987270660 100644 --- a/caffe2/python/layers/label_smooth.py +++ b/caffe2/python/layers/label_smooth.py @@ -15,10 +15,10 @@ # @package label_smooth # Module caffe2.python.layers.label_smooth -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/last_n_window_collector.py b/caffe2/python/layers/last_n_window_collector.py index fb93effbff2d..a16b731a2f78 100644 --- a/caffe2/python/layers/last_n_window_collector.py +++ b/caffe2/python/layers/last_n_window_collector.py @@ -1,9 +1,9 @@ ## @package last_n_window_collector # Module caffe2.python.layers.last_n_window_collector -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/layer_normalization.py b/caffe2/python/layers/layer_normalization.py index 0dc6795994cb..580a03bfc5da 100644 --- a/caffe2/python/layers/layer_normalization.py +++ b/caffe2/python/layers/layer_normalization.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/layers.py b/caffe2/python/layers/layers.py index 216d0b2e3286..abcdd1596220 100644 --- a/caffe2/python/layers/layers.py +++ 
b/caffe2/python/layers/layers.py @@ -1,6 +1,6 @@ ## @package layers # Module caffe2.python.layers.layers -from __future__ import absolute_import, division, print_function, unicode_literals + import logging from collections import namedtuple diff --git a/caffe2/python/layers/margin_rank_loss.py b/caffe2/python/layers/margin_rank_loss.py index 15267752caa3..6f97ade23ef4 100644 --- a/caffe2/python/layers/margin_rank_loss.py +++ b/caffe2/python/layers/margin_rank_loss.py @@ -1,9 +1,9 @@ ## @package random_neg_rank_loss # Module caffe2.python.layers.random_neg_rank_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema, core from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/merge_id_lists.py b/caffe2/python/layers/merge_id_lists.py index 117dd7904787..68c27b587567 100644 --- a/caffe2/python/layers/merge_id_lists.py +++ b/caffe2/python/layers/merge_id_lists.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/pairwise_similarity.py b/caffe2/python/layers/pairwise_similarity.py index 30cb6ace2b81..5020e5432c2a 100644 --- a/caffe2/python/layers/pairwise_similarity.py +++ b/caffe2/python/layers/pairwise_similarity.py @@ -1,9 +1,9 @@ ## @package dot_product # Module caffe2.python.layers.dot_product -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/position_weighted.py b/caffe2/python/layers/position_weighted.py index 19ddda2b6dcf..12e26bcd774e 100644 --- a/caffe2/python/layers/position_weighted.py +++ b/caffe2/python/layers/position_weighted.py @@ -1,9 +1,9 @@ ## @package position_weighted # Module caffe2.python.layers.position_weighted -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import logging import numpy as np diff --git a/caffe2/python/layers/random_fourier_features.py b/caffe2/python/layers/random_fourier_features.py index 6056da4ba7cf..bde05ab97147 100644 --- a/caffe2/python/layers/random_fourier_features.py +++ b/caffe2/python/layers/random_fourier_features.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/reservoir_sampling.py b/caffe2/python/layers/reservoir_sampling.py index 3819a1971da4..21b9c44f2a79 100644 --- a/caffe2/python/layers/reservoir_sampling.py +++ b/caffe2/python/layers/reservoir_sampling.py @@ -1,9 +1,9 @@ ## @package reservoir_sampling # Module caffe2.python.layers.reservoir_sampling -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/sampling_train.py 
b/caffe2/python/layers/sampling_train.py index 1c617326da7f..034c897e2c2f 100644 --- a/caffe2/python/layers/sampling_train.py +++ b/caffe2/python/layers/sampling_train.py @@ -1,9 +1,9 @@ ## @package sampling_train # Module caffe2.python.layers.sampling_train -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer, get_layer_class diff --git a/caffe2/python/layers/sampling_trainable_mixin.py b/caffe2/python/layers/sampling_trainable_mixin.py index 911fd8391e3f..403cc5a4a51c 100644 --- a/caffe2/python/layers/sampling_trainable_mixin.py +++ b/caffe2/python/layers/sampling_trainable_mixin.py @@ -1,9 +1,9 @@ ## @package sampling_trainable_mixin # Module caffe2.python.layers.sampling_trainable_mixin -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import abc import six diff --git a/caffe2/python/layers/select_record_by_context.py b/caffe2/python/layers/select_record_by_context.py index 65e44bece97c..49e42ca308d7 100644 --- a/caffe2/python/layers/select_record_by_context.py +++ b/caffe2/python/layers/select_record_by_context.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import logging diff --git a/caffe2/python/layers/semi_random_features.py b/caffe2/python/layers/semi_random_features.py index d7b96d956d08..58f30ac71f19 100644 --- a/caffe2/python/layers/semi_random_features.py +++ b/caffe2/python/layers/semi_random_features.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.arc_cosine_feature_map import ArcCosineFeatureMap diff --git a/caffe2/python/layers/sparse_dropout_with_replacement.py b/caffe2/python/layers/sparse_dropout_with_replacement.py index 8275d83d8734..3e03888e57dc 100644 --- a/caffe2/python/layers/sparse_dropout_with_replacement.py +++ b/caffe2/python/layers/sparse_dropout_with_replacement.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/sparse_feature_hash.py b/caffe2/python/layers/sparse_feature_hash.py index 3927b199fbdf..c3ada99dc4a7 100644 --- a/caffe2/python/layers/sparse_feature_hash.py +++ b/caffe2/python/layers/sparse_feature_hash.py @@ -1,9 +1,9 @@ ## @package sparse_feature_hash # Module caffe2.python.layers.sparse_feature_hash -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema, core from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/sparse_lookup.py b/caffe2/python/layers/sparse_lookup.py index 30cb60266c4d..dd1c42606063 100644 --- a/caffe2/python/layers/sparse_lookup.py +++ b/caffe2/python/layers/sparse_lookup.py @@ -1,9 +1,9 @@ ## @package sparse_lookup # Module caffe2.python.layers.sparse_lookup -from __future__ import absolute_import -from __future__ 
import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.optimizer import FP16_ENGINES, Optimizer from caffe2.python.helpers.arg_scope import get_current_scope diff --git a/caffe2/python/layers/split.py b/caffe2/python/layers/split.py index a83881f5a091..58e569a272c7 100644 --- a/caffe2/python/layers/split.py +++ b/caffe2/python/layers/split.py @@ -1,9 +1,9 @@ ## @package split # Module caffe2.python.layers.split -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/tags.py b/caffe2/python/layers/tags.py index 28b7312dbcaa..5161ee2e1a96 100644 --- a/caffe2/python/layers/tags.py +++ b/caffe2/python/layers/tags.py @@ -1,9 +1,9 @@ ## @package tags # Module caffe2.python.layers.tags -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import six diff --git a/caffe2/python/layers/uniform_sampling.py b/caffe2/python/layers/uniform_sampling.py index 46ed29bbaa41..5581371d008d 100644 --- a/caffe2/python/layers/uniform_sampling.py +++ b/caffe2/python/layers/uniform_sampling.py @@ -1,9 +1,9 @@ ## @package uniform_sampling # Module caffe2.python.layers.uniform_sampling -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/layers_test.py b/caffe2/python/layers_test.py index 4d037a891ade..e084a011d357 100644 --- a/caffe2/python/layers_test.py +++ b/caffe2/python/layers_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st import numpy as np diff --git a/caffe2/python/lazy_dyndep.py b/caffe2/python/lazy_dyndep.py index e1799838f4b2..e53d4fda350b 100644 --- a/caffe2/python/lazy_dyndep.py +++ b/caffe2/python/lazy_dyndep.py @@ -1,9 +1,9 @@ ## @package lazy_dyndep # Module caffe2.python.lazy_dyndep -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import os from caffe2.python import dyndep, lazy diff --git a/caffe2/python/lazy_dyndep_test.py b/caffe2/python/lazy_dyndep_test.py index 881215ac36e3..1441facd3a6f 100644 --- a/caffe2/python/lazy_dyndep_test.py +++ b/caffe2/python/lazy_dyndep_test.py @@ -1,9 +1,9 @@ #!/usr/bin/env python -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given, settings import hypothesis.strategies as st diff --git a/caffe2/python/lengths_reducer_fused_8bit_rowwise_ops_test.py b/caffe2/python/lengths_reducer_fused_8bit_rowwise_ops_test.py index f08e9147d3ba..718b7fb3a987 100644 --- a/caffe2/python/lengths_reducer_fused_8bit_rowwise_ops_test.py +++ b/caffe2/python/lengths_reducer_fused_8bit_rowwise_ops_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git 
a/caffe2/python/lengths_reducer_rowwise_8bit_ops_test.py b/caffe2/python/lengths_reducer_rowwise_8bit_ops_test.py index d73db5aaa36c..a38d442dd952 100644 --- a/caffe2/python/lengths_reducer_rowwise_8bit_ops_test.py +++ b/caffe2/python/lengths_reducer_rowwise_8bit_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/lstm_benchmark.py b/caffe2/python/lstm_benchmark.py index cfa53a81610c..29f819ec622e 100644 --- a/caffe2/python/lstm_benchmark.py +++ b/caffe2/python/lstm_benchmark.py @@ -1,9 +1,9 @@ ## @package lstm_benchmark # Module caffe2.python.lstm_benchmark -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import workspace, core, utils, rnn_cell, model_helper diff --git a/caffe2/python/memonger.py b/caffe2/python/memonger.py index c299c817ace4..a728fc4e2157 100644 --- a/caffe2/python/memonger.py +++ b/caffe2/python/memonger.py @@ -1,9 +1,9 @@ ## @package memonger # Module caffe2.python.memonger -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import networkx as nx import collections diff --git a/caffe2/python/memonger_test.py b/caffe2/python/memonger_test.py index 7d5c52224b1c..8584e8d5e4cc 100644 --- a/caffe2/python/memonger_test.py +++ b/caffe2/python/memonger_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/mkl/mkl_LRN_op_test.py b/caffe2/python/mkl/mkl_LRN_op_test.py index 73df4820a5d1..2b084bea591b 100644 --- a/caffe2/python/mkl/mkl_LRN_op_test.py +++ b/caffe2/python/mkl/mkl_LRN_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_LRN_speed_test.py b/caffe2/python/mkl/mkl_LRN_speed_test.py index 35eae62d5be1..ae42902d9102 100644 --- a/caffe2/python/mkl/mkl_LRN_speed_test.py +++ b/caffe2/python/mkl/mkl_LRN_speed_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/mkl/mkl_concat_op_test.py b/caffe2/python/mkl/mkl_concat_op_test.py index a1a96ca755d9..8b01f8885b1c 100644 --- a/caffe2/python/mkl/mkl_concat_op_test.py +++ b/caffe2/python/mkl/mkl_concat_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_conv_op_test.py b/caffe2/python/mkl/mkl_conv_op_test.py index 38ceb680bb6d..f1fe7b062318 100644 --- a/caffe2/python/mkl/mkl_conv_op_test.py +++ b/caffe2/python/mkl/mkl_conv_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import 
division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_copy_op_test.py b/caffe2/python/mkl/mkl_copy_op_test.py index 633865cd5047..b2baeb9ef1af 100644 --- a/caffe2/python/mkl/mkl_copy_op_test.py +++ b/caffe2/python/mkl/mkl_copy_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_elementwise_add_op_test.py b/caffe2/python/mkl/mkl_elementwise_add_op_test.py index eab454ffe105..0709b5afd9f6 100644 --- a/caffe2/python/mkl/mkl_elementwise_add_op_test.py +++ b/caffe2/python/mkl/mkl_elementwise_add_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_elementwise_sum_op_test.py b/caffe2/python/mkl/mkl_elementwise_sum_op_test.py index 71e0754a0214..3adec4848e50 100644 --- a/caffe2/python/mkl/mkl_elementwise_sum_op_test.py +++ b/caffe2/python/mkl/mkl_elementwise_sum_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_fc_op_test.py b/caffe2/python/mkl/mkl_fc_op_test.py index 01e8c9b5a925..01786d55c337 100644 --- a/caffe2/python/mkl/mkl_fc_op_test.py +++ b/caffe2/python/mkl/mkl_fc_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_fc_speed_test.py b/caffe2/python/mkl/mkl_fc_speed_test.py index 7cabadfe1da0..85f5605e9676 100644 --- a/caffe2/python/mkl/mkl_fc_speed_test.py +++ b/caffe2/python/mkl/mkl_fc_speed_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/mkl/mkl_fill_op_test.py b/caffe2/python/mkl/mkl_fill_op_test.py index dbdf12c1aca4..26a9b7131b0b 100644 --- a/caffe2/python/mkl/mkl_fill_op_test.py +++ b/caffe2/python/mkl/mkl_fill_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_pool_op_test.py b/caffe2/python/mkl/mkl_pool_op_test.py index b733edaace1c..a56e9448317a 100644 --- a/caffe2/python/mkl/mkl_pool_op_test.py +++ b/caffe2/python/mkl/mkl_pool_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_pool_speed_test.py b/caffe2/python/mkl/mkl_pool_speed_test.py index a0fa8ca6ece8..b25e0f915cc7 100644 --- a/caffe2/python/mkl/mkl_pool_speed_test.py +++ 
b/caffe2/python/mkl/mkl_pool_speed_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/mkl/mkl_relu_op_test.py b/caffe2/python/mkl/mkl_relu_op_test.py index 90e365da554b..76ec33bcbe91 100644 --- a/caffe2/python/mkl/mkl_relu_op_test.py +++ b/caffe2/python/mkl/mkl_relu_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_sbn_op_test.py b/caffe2/python/mkl/mkl_sbn_op_test.py index 4a5fad2b7b68..2ac9080ce670 100644 --- a/caffe2/python/mkl/mkl_sbn_op_test.py +++ b/caffe2/python/mkl/mkl_sbn_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_sbn_speed_test.py b/caffe2/python/mkl/mkl_sbn_speed_test.py index d37bef32b9b7..3b3b71d1c997 100644 --- a/caffe2/python/mkl/mkl_sbn_speed_test.py +++ b/caffe2/python/mkl/mkl_sbn_speed_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/mkl/mkl_sigmoid_op_test.py b/caffe2/python/mkl/mkl_sigmoid_op_test.py index 654008c67b7d..abdb0983778d 100644 --- a/caffe2/python/mkl/mkl_sigmoid_op_test.py +++ b/caffe2/python/mkl/mkl_sigmoid_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_speed_test.py b/caffe2/python/mkl/mkl_speed_test.py index 4034705580d5..9a7310a484d1 100644 --- a/caffe2/python/mkl/mkl_speed_test.py +++ b/caffe2/python/mkl/mkl_speed_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/mkl/mkl_squeeze_op_test.py b/caffe2/python/mkl/mkl_squeeze_op_test.py index 1e4b5791b0b6..8af090f60d88 100644 --- a/caffe2/python/mkl/mkl_squeeze_op_test.py +++ b/caffe2/python/mkl/mkl_squeeze_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/rewrite_graph.py b/caffe2/python/mkl/rewrite_graph.py index c003e0e3b09b..3a88a3deeccc 100644 --- a/caffe2/python/mkl/rewrite_graph.py +++ b/caffe2/python/mkl/rewrite_graph.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import copy from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/mkl/rewrite_graph_test.py b/caffe2/python/mkl/rewrite_graph_test.py index 42e3269fc4d8..1ad209cdbdfd 100644 --- a/caffe2/python/mkl/rewrite_graph_test.py +++ 
b/caffe2/python/mkl/rewrite_graph_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/mkl_test_util.py b/caffe2/python/mkl_test_util.py index 5d8f66500190..88fb3cc800ec 100644 --- a/caffe2/python/mkl_test_util.py +++ b/caffe2/python/mkl_test_util.py @@ -6,10 +6,10 @@ operators. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st diff --git a/caffe2/python/model_helper.py b/caffe2/python/model_helper.py index a26bf844f2de..a5a4865c0ec1 100644 --- a/caffe2/python/model_helper.py +++ b/caffe2/python/model_helper.py @@ -1,9 +1,9 @@ ## @package model_helper # Module caffe2.python.model_helper -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, scope, workspace from caffe2.python.helpers.db_input import db_input diff --git a/caffe2/python/model_helper_test.py b/caffe2/python/model_helper_test.py index fcccddf401db..1423e4a97733 100644 --- a/caffe2/python/model_helper_test.py +++ b/caffe2/python/model_helper_test.py @@ -1,6 +1,6 @@ """unittest for ModelHelper class""" -from __future__ import absolute_import, division, print_function + import unittest diff --git a/caffe2/python/modeling/compute_histogram_for_blobs.py b/caffe2/python/modeling/compute_histogram_for_blobs.py index 3b5ea4b64cba..ea83f96f7019 100644 --- a/caffe2/python/modeling/compute_histogram_for_blobs.py +++ b/caffe2/python/modeling/compute_histogram_for_blobs.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.modeling.net_modifier import NetModifier diff --git a/caffe2/python/modeling/compute_histogram_for_blobs_test.py b/caffe2/python/modeling/compute_histogram_for_blobs_test.py index 6c3b59950898..4ce6bf11487a 100644 --- a/caffe2/python/modeling/compute_histogram_for_blobs_test.py +++ b/caffe2/python/modeling/compute_histogram_for_blobs_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import workspace, brew, model_helper diff --git a/caffe2/python/modeling/compute_norm_for_blobs.py b/caffe2/python/modeling/compute_norm_for_blobs.py index 41b7f88d24eb..24ed7a7482c7 100644 --- a/caffe2/python/modeling/compute_norm_for_blobs.py +++ b/caffe2/python/modeling/compute_norm_for_blobs.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema, muji from caffe2.python.modeling.net_modifier import NetModifier diff --git a/caffe2/python/modeling/compute_norm_for_blobs_test.py b/caffe2/python/modeling/compute_norm_for_blobs_test.py index 3fefce0c4420..1bf3dae0353f 100644 --- a/caffe2/python/modeling/compute_norm_for_blobs_test.py +++ b/caffe2/python/modeling/compute_norm_for_blobs_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from 
__future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import workspace, brew, model_helper diff --git a/caffe2/python/modeling/compute_statistics_for_blobs.py b/caffe2/python/modeling/compute_statistics_for_blobs.py index 9a3fbcc96954..588b4a827cb8 100644 --- a/caffe2/python/modeling/compute_statistics_for_blobs.py +++ b/caffe2/python/modeling/compute_statistics_for_blobs.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.modeling.net_modifier import NetModifier diff --git a/caffe2/python/modeling/compute_statistics_for_blobs_test.py b/caffe2/python/modeling/compute_statistics_for_blobs_test.py index e880f3edacb1..bf75a1f7d149 100644 --- a/caffe2/python/modeling/compute_statistics_for_blobs_test.py +++ b/caffe2/python/modeling/compute_statistics_for_blobs_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import workspace, brew, model_helper diff --git a/caffe2/python/modeling/get_entry_from_blobs.py b/caffe2/python/modeling/get_entry_from_blobs.py index 88daa226c887..061dfe33991b 100644 --- a/caffe2/python/modeling/get_entry_from_blobs.py +++ b/caffe2/python/modeling/get_entry_from_blobs.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.modeling.net_modifier import NetModifier diff --git a/caffe2/python/modeling/get_entry_from_blobs_test.py b/caffe2/python/modeling/get_entry_from_blobs_test.py index 8f4fbb864be1..3ec146766f30 100644 --- a/caffe2/python/modeling/get_entry_from_blobs_test.py +++ b/caffe2/python/modeling/get_entry_from_blobs_test.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import workspace, brew, model_helper diff --git a/caffe2/python/modeling/gradient_clipping.py b/caffe2/python/modeling/gradient_clipping.py index 1999ced9ba1b..b01bc2ba301f 100644 --- a/caffe2/python/modeling/gradient_clipping.py +++ b/caffe2/python/modeling/gradient_clipping.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/modeling/gradient_clipping_test.py b/caffe2/python/modeling/gradient_clipping_test.py index ca5c2ba8e22b..0b0e962cb727 100644 --- a/caffe2/python/modeling/gradient_clipping_test.py +++ b/caffe2/python/modeling/gradient_clipping_test.py @@ -13,10 +13,10 @@ # limitations under the License. 
############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import workspace, brew, model_helper diff --git a/caffe2/python/modeling/initializers.py b/caffe2/python/modeling/initializers.py index 2053d9e53976..b3e4b1a44dd7 100644 --- a/caffe2/python/modeling/initializers.py +++ b/caffe2/python/modeling/initializers.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.core import DataType, BlobReference, ScopedBlobReference from caffe2.python.modeling.parameter_info import ParameterInfo diff --git a/caffe2/python/modeling/initializers_test.py b/caffe2/python/modeling/initializers_test.py index 0355d1871787..fad40c159b6e 100644 --- a/caffe2/python/modeling/initializers_test.py +++ b/caffe2/python/modeling/initializers_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import brew, model_helper, workspace diff --git a/caffe2/python/modeling/net_modifier.py b/caffe2/python/modeling/net_modifier.py index 0f0ac7535c88..e824c828e4bd 100644 --- a/caffe2/python/modeling/net_modifier.py +++ b/caffe2/python/modeling/net_modifier.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import abc import six diff --git a/caffe2/python/modeling/parameter_info.py b/caffe2/python/modeling/parameter_info.py index 589aa51a7b1c..195048cf91e8 100644 --- a/caffe2/python/modeling/parameter_info.py +++ b/caffe2/python/modeling/parameter_info.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core diff --git a/caffe2/python/modeling/parameter_sharing.py b/caffe2/python/modeling/parameter_sharing.py index 77e5cbd3f8bc..a0174500a413 100644 --- a/caffe2/python/modeling/parameter_sharing.py +++ b/caffe2/python/modeling/parameter_sharing.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import scope diff --git a/caffe2/python/modeling/parameter_sharing_test.py b/caffe2/python/modeling/parameter_sharing_test.py index f616fc1ea6ed..d37e40880c02 100644 --- a/caffe2/python/modeling/parameter_sharing_test.py +++ b/caffe2/python/modeling/parameter_sharing_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import brew, model_helper, scope from caffe2.python.modeling.parameter_sharing import ( diff --git a/caffe2/python/models/__sym_init__.py b/caffe2/python/models/__sym_init__.py index 79f045879ebc..fa10bff7246b 100644 --- a/caffe2/python/models/__sym_init__.py +++ b/caffe2/python/models/__sym_init__.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from 
__future__ import unicode_literals + + + + import os from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/models/download.py b/caffe2/python/models/download.py index 4b9a570de807..46a9b59f6627 100644 --- a/caffe2/python/models/download.py +++ b/caffe2/python/models/download.py @@ -1,9 +1,9 @@ ## @package download # Module caffe2.python.models.download -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import os import sys diff --git a/caffe2/python/models/imagenet_trainer_test_utils.py b/caffe2/python/models/imagenet_trainer_test_utils.py index 59107336ccd6..fec7708ea150 100644 --- a/caffe2/python/models/imagenet_trainer_test_utils.py +++ b/caffe2/python/models/imagenet_trainer_test_utils.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import time diff --git a/caffe2/python/models/resnet.py b/caffe2/python/models/resnet.py index 41ca087d9637..430d3d335e1e 100644 --- a/caffe2/python/models/resnet.py +++ b/caffe2/python/models/resnet.py @@ -1,9 +1,9 @@ ## @package resnet # Module caffe2.python.models.resnet -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import brew import logging diff --git a/caffe2/python/models/resnet_test.py b/caffe2/python/models/resnet_test.py index ce542e8da046..38d87cefff05 100644 --- a/caffe2/python/models/resnet_test.py +++ b/caffe2/python/models/resnet_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/models/seq2seq/beam_search.py b/caffe2/python/models/seq2seq/beam_search.py index 7b909697fb05..6fc9f8ece480 100644 --- a/caffe2/python/models/seq2seq/beam_search.py +++ b/caffe2/python/models/seq2seq/beam_search.py @@ -1,9 +1,9 @@ ## @package beam_search # Module caffe2.python.models.seq2seq.beam_search -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from collections import namedtuple from caffe2.python import core diff --git a/caffe2/python/models/seq2seq/seq2seq_beam_search_test.py b/caffe2/python/models/seq2seq/seq2seq_beam_search_test.py index 0ee1f6e35ba0..c10d2f1ab4ed 100644 --- a/caffe2/python/models/seq2seq/seq2seq_beam_search_test.py +++ b/caffe2/python/models/seq2seq/seq2seq_beam_search_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import os diff --git a/caffe2/python/models/seq2seq/seq2seq_model_helper.py b/caffe2/python/models/seq2seq/seq2seq_model_helper.py index b2a50c4bd58b..5adabb86fadf 100644 --- a/caffe2/python/models/seq2seq/seq2seq_model_helper.py +++ b/caffe2/python/models/seq2seq/seq2seq_model_helper.py @@ -1,9 +1,9 @@ ## @package seq2seq_model_helper # Module caffe2.python.models.seq2seq.seq2seq_model_helper -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import scope from caffe2.python.model_helper 
import ModelHelper diff --git a/caffe2/python/models/seq2seq/seq2seq_model_helper_test.py b/caffe2/python/models/seq2seq/seq2seq_model_helper_test.py index 8095440f2e5a..b70b74d39dc9 100644 --- a/caffe2/python/models/seq2seq/seq2seq_model_helper_test.py +++ b/caffe2/python/models/seq2seq/seq2seq_model_helper_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.models.seq2seq import seq2seq_model_helper from caffe2.python import scope, test_util diff --git a/caffe2/python/models/seq2seq/seq2seq_util.py b/caffe2/python/models/seq2seq/seq2seq_util.py index d0702880c1ec..e1b4224ea4c8 100644 --- a/caffe2/python/models/seq2seq/seq2seq_util.py +++ b/caffe2/python/models/seq2seq/seq2seq_util.py @@ -2,10 +2,10 @@ # Module caffe2.python.examples.seq2seq_util """ A bunch of util functions to build Seq2Seq models with Caffe2.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import collections from future.utils import viewitems diff --git a/caffe2/python/models/seq2seq/train.py b/caffe2/python/models/seq2seq/train.py index df68e3e30d7b..8080318da4d0 100644 --- a/caffe2/python/models/seq2seq/train.py +++ b/caffe2/python/models/seq2seq/train.py @@ -1,9 +1,9 @@ ## @package train # Module caffe2.python.models.seq2seq.train -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import collections diff --git a/caffe2/python/models/seq2seq/translate.py b/caffe2/python/models/seq2seq/translate.py index d2b6a4f6399f..7e77f623e553 100644 --- a/caffe2/python/models/seq2seq/translate.py +++ b/caffe2/python/models/seq2seq/translate.py @@ -1,9 +1,9 @@ ## @package translate # Module caffe2.python.models.seq2seq.translate -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from abc import ABCMeta, abstractmethod import argparse diff --git a/caffe2/python/models/shufflenet.py b/caffe2/python/models/shufflenet.py index c9075a4a1295..33a7f7a4b7c5 100644 --- a/caffe2/python/models/shufflenet.py +++ b/caffe2/python/models/shufflenet.py @@ -1,9 +1,9 @@ # Module caffe2.python.models.shufflenet -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import brew diff --git a/caffe2/python/models/shufflenet_test.py b/caffe2/python/models/shufflenet_test.py index 344c720b3eb6..6ccfd0a83354 100644 --- a/caffe2/python/models/shufflenet_test.py +++ b/caffe2/python/models/shufflenet_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/modifier_context.py b/caffe2/python/modifier_context.py index 008e651e41f7..b65d97587549 100644 --- a/caffe2/python/modifier_context.py +++ b/caffe2/python/modifier_context.py @@ -1,9 +1,9 @@ # @package modifier_context # Module caffe2.python.modifier_context -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + 
DEFAULT_MODIFIER = 'DEFAULT' diff --git a/caffe2/python/net_builder.py b/caffe2/python/net_builder.py index f1af8c3eb521..70dcdec11a58 100644 --- a/caffe2/python/net_builder.py +++ b/caffe2/python/net_builder.py @@ -1,9 +1,9 @@ ## @package net_builder # Module caffe2.python.net_builder -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, context from caffe2.python.task import Task, TaskGroup diff --git a/caffe2/python/net_builder_test.py b/caffe2/python/net_builder_test.py index 169419c5c17b..bef6caefac3d 100644 --- a/caffe2/python/net_builder_test.py +++ b/caffe2/python/net_builder_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace from caffe2.python.core import Plan, to_execution_step, Net diff --git a/caffe2/python/net_drawer.py b/caffe2/python/net_drawer.py index 1fd0833a718f..b55699c1c095 100644 --- a/caffe2/python/net_drawer.py +++ b/caffe2/python/net_drawer.py @@ -1,9 +1,9 @@ ## @package net_drawer # Module caffe2.python.net_drawer -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import json import logging diff --git a/caffe2/python/net_printer.py b/caffe2/python/net_printer.py index 09cde6f76767..8e1d65c01ce7 100644 --- a/caffe2/python/net_printer.py +++ b/caffe2/python/net_printer.py @@ -1,9 +1,9 @@ ## @package net_printer # Module caffe2.python.net_printer -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto.caffe2_pb2 import OperatorDef, NetDef from caffe2.python.checkpoint import Job diff --git a/caffe2/python/net_printer_test.py b/caffe2/python/net_printer_test.py index bc086c3eee2a..e71a2b323dea 100644 --- a/caffe2/python/net_printer_test.py +++ b/caffe2/python/net_printer_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import net_printer from caffe2.python.checkpoint import Job diff --git a/caffe2/python/nomnigraph.py b/caffe2/python/nomnigraph.py index c2f1774c7b2b..2b83e0ec9358 100644 --- a/caffe2/python/nomnigraph.py +++ b/caffe2/python/nomnigraph.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import errno import os diff --git a/caffe2/python/nomnigraph_test.py b/caffe2/python/nomnigraph_test.py index 6ff47c6d4c9a..3d9adc696486 100644 --- a/caffe2/python/nomnigraph_test.py +++ b/caffe2/python/nomnigraph_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace, test_util from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/nomnigraph_transformations.py b/caffe2/python/nomnigraph_transformations.py index f4bc2c68bbb6..570c743df152 100644 --- a/caffe2/python/nomnigraph_transformations.py +++ b/caffe2/python/nomnigraph_transformations.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals 
+ from collections import defaultdict diff --git a/caffe2/python/nomnigraph_transformations_test.py b/caffe2/python/nomnigraph_transformations_test.py index 6c58691db277..adbfe1a4885a 100644 --- a/caffe2/python/nomnigraph_transformations_test.py +++ b/caffe2/python/nomnigraph_transformations_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python import test_util as tu diff --git a/caffe2/python/normalizer.py b/caffe2/python/normalizer.py index 1d452c6cbe60..2ca147328c78 100644 --- a/caffe2/python/normalizer.py +++ b/caffe2/python/normalizer.py @@ -1,6 +1,6 @@ # @package optimizer # Module caffe2.python.normalizer -from __future__ import absolute_import, division, print_function, unicode_literals + class Normalizer(object): diff --git a/caffe2/python/normalizer_context.py b/caffe2/python/normalizer_context.py index 57c1052103dc..a85b993b4502 100644 --- a/caffe2/python/normalizer_context.py +++ b/caffe2/python/normalizer_context.py @@ -1,9 +1,9 @@ # @package regularizer_context # Module caffe2.python.normalizer_context -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import context from caffe2.python.modifier_context import ( diff --git a/caffe2/python/normalizer_test.py b/caffe2/python/normalizer_test.py index 1f4cb4896778..f0ce5099ea75 100644 --- a/caffe2/python/normalizer_test.py +++ b/caffe2/python/normalizer_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python.normalizer_context import UseNormalizer, NormalizerContext from caffe2.python.normalizer import BatchNormalizer diff --git a/caffe2/python/numa_benchmark.py b/caffe2/python/numa_benchmark.py index 21c1cb158da1..a840c6932123 100644 --- a/caffe2/python/numa_benchmark.py +++ b/caffe2/python/numa_benchmark.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, workspace from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/numa_test.py b/caffe2/python/numa_test.py index 692f515abe87..aba6e420ed55 100644 --- a/caffe2/python/numa_test.py +++ b/caffe2/python/numa_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, workspace from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/observer_test.py b/caffe2/python/observer_test.py index 684514d17268..cc3ca1718a5c 100644 --- a/caffe2/python/observer_test.py +++ b/caffe2/python/observer_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py index 9fe7b23bb7ae..d0f768e42eeb 100644 --- a/caffe2/python/onnx/backend.py +++ b/caffe2/python/onnx/backend.py @@ -5,10 +5,10 @@ To run this, you will need to have Caffe2 installed as well. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import os import collections diff --git a/caffe2/python/onnx/backend_cpp_rep.py b/caffe2/python/onnx/backend_cpp_rep.py index 27135b35763d..4a75068cfd03 100644 --- a/caffe2/python/onnx/backend_cpp_rep.py +++ b/caffe2/python/onnx/backend_cpp_rep.py @@ -1,10 +1,10 @@ ## @package onnx # Module caffe2.python.onnx.backend_rep_cpp -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from onnx.backend.base import BackendRep, namedtupledict diff --git a/caffe2/python/onnx/backend_rep.py b/caffe2/python/onnx/backend_rep.py index 13feea3ac8c9..ab97fd562dc1 100644 --- a/caffe2/python/onnx/backend_rep.py +++ b/caffe2/python/onnx/backend_rep.py @@ -1,9 +1,9 @@ # @package onnx # Module caffe2.python.onnx.backend_rep -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/onnx/bin/conversion.py b/caffe2/python/onnx/bin/conversion.py index a30ebdfc3f54..126eef8a8470 100644 --- a/caffe2/python/onnx/bin/conversion.py +++ b/caffe2/python/onnx/bin/conversion.py @@ -1,9 +1,9 @@ ## @package onnx # Module caffe2.python.onnx.bin.conversion -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import json diff --git a/caffe2/python/onnx/error.py b/caffe2/python/onnx/error.py index da72af2cc9b1..1bac8290464d 100644 --- a/caffe2/python/onnx/error.py +++ b/caffe2/python/onnx/error.py @@ -1,8 +1,8 @@ ## @package onnx # Module caffe2.python.onnx.error -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + class BaseException(Exception): pass class Unsupported(BaseException): pass diff --git a/caffe2/python/onnx/frontend.py b/caffe2/python/onnx/frontend.py index 0fc1c0328093..ee3c30949ff7 100644 --- a/caffe2/python/onnx/frontend.py +++ b/caffe2/python/onnx/frontend.py @@ -6,10 +6,10 @@ To run this, you will need to have Caffe2 installed as well. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import itertools import logging diff --git a/caffe2/python/onnx/helper.py b/caffe2/python/onnx/helper.py index e1d56e1a6766..7f8f1a6d346a 100644 --- a/caffe2/python/onnx/helper.py +++ b/caffe2/python/onnx/helper.py @@ -1,9 +1,9 @@ ## @package onnx # Module caffe2.python.onnx.helper -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from onnx.backend.base import namedtupledict diff --git a/caffe2/python/onnx/onnxifi.py b/caffe2/python/onnx/onnxifi.py index 6bbd35cd434c..a04e7e4554b9 100644 --- a/caffe2/python/onnx/onnxifi.py +++ b/caffe2/python/onnx/onnxifi.py @@ -5,10 +5,10 @@ ONNXIFI a Caffe2 net """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/onnx/test_onnxifi.py b/caffe2/python/onnx/test_onnxifi.py index a859b572bae6..7eafccaec9e4 100644 --- a/caffe2/python/onnx/test_onnxifi.py +++ b/caffe2/python/onnx/test_onnxifi.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import json import numpy as np diff --git a/caffe2/python/onnx/tests/__init__.py b/caffe2/python/onnx/tests/__init__.py index e0a02b9d5d83..fd40910d9e70 100644 --- a/caffe2/python/onnx/tests/__init__.py +++ b/caffe2/python/onnx/tests/__init__.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + diff --git a/caffe2/python/onnx/tests/c2_ref_test.py b/caffe2/python/onnx/tests/c2_ref_test.py index d909cf828042..d253b06658a3 100644 --- a/caffe2/python/onnx/tests/c2_ref_test.py +++ b/caffe2/python/onnx/tests/c2_ref_test.py @@ -1,10 +1,10 @@ # @package onnx # Module caffe2.python.onnx.tests.c2_ref_test -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import json import os diff --git a/caffe2/python/onnx/tests/conversion_test.py b/caffe2/python/onnx/tests/conversion_test.py index 8fa128acd62b..86cdddcd1692 100644 --- a/caffe2/python/onnx/tests/conversion_test.py +++ b/caffe2/python/onnx/tests/conversion_test.py @@ -1,9 +1,9 @@ ## @package onnx # Module caffe2.python.onnx.tests.conversion_test -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import json import six diff --git a/caffe2/python/onnx/tests/helper_test.py b/caffe2/python/onnx/tests/helper_test.py index e3682780cb04..9000ad94fd9b 100644 --- a/caffe2/python/onnx/tests/helper_test.py +++ b/caffe2/python/onnx/tests/helper_test.py @@ -1,10 +1,10 @@ ## @package onnx # Module caffe2.python.onnx.tests.helper_test -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest diff --git a/caffe2/python/onnx/tests/onnx_backend_test.py b/caffe2/python/onnx/tests/onnx_backend_test.py index ad7885fcda74..e4de0a19c07a 100644 
--- a/caffe2/python/onnx/tests/onnx_backend_test.py +++ b/caffe2/python/onnx/tests/onnx_backend_test.py @@ -1,10 +1,10 @@ # @package onnx # Module caffe2.python.onnx.tests.onnx_backend_test -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import os diff --git a/caffe2/python/onnx/tests/ssa_test.py b/caffe2/python/onnx/tests/ssa_test.py index 34f849400e30..d34d4a0e5287 100644 --- a/caffe2/python/onnx/tests/ssa_test.py +++ b/caffe2/python/onnx/tests/ssa_test.py @@ -1,10 +1,10 @@ ## @package onnx # Module caffe2.python.onnx.tests.ssa_test -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import copy import onnx diff --git a/caffe2/python/onnx/tests/test_utils.py b/caffe2/python/onnx/tests/test_utils.py index 1fec938c8e88..d224daf05ba3 100644 --- a/caffe2/python/onnx/tests/test_utils.py +++ b/caffe2/python/onnx/tests/test_utils.py @@ -1,10 +1,10 @@ ## @package onnx # Module caffe2.python.onnx.tests.test_utils -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import os import unittest diff --git a/caffe2/python/onnx/workspace.py b/caffe2/python/onnx/workspace.py index a311ec37dfdc..f03e3609fe8b 100644 --- a/caffe2/python/onnx/workspace.py +++ b/caffe2/python/onnx/workspace.py @@ -1,10 +1,10 @@ ## @package onnx # Module caffe2.python.onnx.workspace -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import uuid diff --git a/caffe2/python/operator_fp_exceptions_test.py b/caffe2/python/operator_fp_exceptions_test.py index 6e08f920a422..3a1ebcd4ec67 100644 --- a/caffe2/python/operator_fp_exceptions_test.py +++ b/caffe2/python/operator_fp_exceptions_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, workspace from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/operator_test/activation_ops_test.py b/caffe2/python/operator_test/activation_ops_test.py index 6a7a5ca18ef3..132bee879f6d 100644 --- a/caffe2/python/operator_test/activation_ops_test.py +++ b/caffe2/python/operator_test/activation_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/adadelta_test.py b/caffe2/python/operator_test/adadelta_test.py index 4cb9a54ec664..265d783e6336 100644 --- a/caffe2/python/operator_test/adadelta_test.py +++ b/caffe2/python/operator_test/adadelta_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import functools diff --git a/caffe2/python/operator_test/adagrad_test.py b/caffe2/python/operator_test/adagrad_test.py index 5ed2d0287e63..55e2f570cf07 100644 --- a/caffe2/python/operator_test/adagrad_test.py +++ b/caffe2/python/operator_test/adagrad_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import functools diff --git 
a/caffe2/python/operator_test/adagrad_test_helper.py b/caffe2/python/operator_test/adagrad_test_helper.py index 891361e3a879..0fe4aa21f5f9 100644 --- a/caffe2/python/operator_test/adagrad_test_helper.py +++ b/caffe2/python/operator_test/adagrad_test_helper.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + from functools import partial diff --git a/caffe2/python/operator_test/adam_test.py b/caffe2/python/operator_test/adam_test.py index 0d188abc52be..2fb13c149922 100644 --- a/caffe2/python/operator_test/adam_test.py +++ b/caffe2/python/operator_test/adam_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import functools diff --git a/caffe2/python/operator_test/affine_channel_op_test.py b/caffe2/python/operator_test/affine_channel_op_test.py index 7e37216b82c1..76b09fdd5cd6 100644 --- a/caffe2/python/operator_test/affine_channel_op_test.py +++ b/caffe2/python/operator_test/affine_channel_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/apmeter_test.py b/caffe2/python/operator_test/apmeter_test.py index b7a50ab98e87..1ca26bf64f31 100644 --- a/caffe2/python/operator_test/apmeter_test.py +++ b/caffe2/python/operator_test/apmeter_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/arg_ops_test.py b/caffe2/python/operator_test/arg_ops_test.py index ce800636e6e6..330d17ed6999 100644 --- a/caffe2/python/operator_test/arg_ops_test.py +++ b/caffe2/python/operator_test/arg_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st import numpy as np diff --git a/caffe2/python/operator_test/assert_test.py b/caffe2/python/operator_test/assert_test.py index e3474c0da7a4..2bbca5ab7376 100644 --- a/caffe2/python/operator_test/assert_test.py +++ b/caffe2/python/operator_test/assert_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from hypothesis import given, settings diff --git a/caffe2/python/operator_test/atomic_ops_test.py b/caffe2/python/operator_test/atomic_ops_test.py index 753e76f15319..88e38df52da5 100644 --- a/caffe2/python/operator_test/atomic_ops_test.py +++ b/caffe2/python/operator_test/atomic_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/basic_rnn_test.py b/caffe2/python/operator_test/basic_rnn_test.py index 516c066c6ed8..e863289d488c 100644 --- a/caffe2/python/operator_test/basic_rnn_test.py +++ b/caffe2/python/operator_test/basic_rnn_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import 
division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace, core, rnn_cell from caffe2.python.model_helper import ModelHelper diff --git a/caffe2/python/operator_test/batch_box_cox_test.py b/caffe2/python/operator_test/batch_box_cox_test.py index 19186220159c..c9306ce1ab07 100644 --- a/caffe2/python/operator_test/batch_box_cox_test.py +++ b/caffe2/python/operator_test/batch_box_cox_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/batch_bucketize_op_test.py b/caffe2/python/operator_test/batch_bucketize_op_test.py index fb13b0c08933..82def0572686 100644 --- a/caffe2/python/operator_test/batch_bucketize_op_test.py +++ b/caffe2/python/operator_test/batch_bucketize_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/batch_moments_op_test.py b/caffe2/python/operator_test/batch_moments_op_test.py index c3ee8750225b..12dd72a4160a 100644 --- a/caffe2/python/operator_test/batch_moments_op_test.py +++ b/caffe2/python/operator_test/batch_moments_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py b/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py index ef59ed23888f..adfc735c66fd 100644 --- a/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py +++ b/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/bbox_transform_test.py b/caffe2/python/operator_test/bbox_transform_test.py index f1ee07c0d1e3..d2584f18af40 100644 --- a/caffe2/python/operator_test/bbox_transform_test.py +++ b/caffe2/python/operator_test/bbox_transform_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/bisect_percentile_op_test.py b/caffe2/python/operator_test/bisect_percentile_op_test.py index 77faeaeeb608..147a41282505 100644 --- a/caffe2/python/operator_test/bisect_percentile_op_test.py +++ b/caffe2/python/operator_test/bisect_percentile_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/blobs_queue_db_test.py b/caffe2/python/operator_test/blobs_queue_db_test.py index 6e4c25c77c78..6cf8170b34f8 100644 --- a/caffe2/python/operator_test/blobs_queue_db_test.py +++ b/caffe2/python/operator_test/blobs_queue_db_test.py @@ 
-1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/operator_test/boolean_mask_test.py b/caffe2/python/operator_test/boolean_mask_test.py index 9ccaeaf9e7a7..05b8212242e4 100644 --- a/caffe2/python/operator_test/boolean_mask_test.py +++ b/caffe2/python/operator_test/boolean_mask_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core diff --git a/caffe2/python/operator_test/boolean_unmask_test.py b/caffe2/python/operator_test/boolean_unmask_test.py index e3bc9f248d3a..8cba2aecf1a4 100644 --- a/caffe2/python/operator_test/boolean_unmask_test.py +++ b/caffe2/python/operator_test/boolean_unmask_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/box_with_nms_limit_op_test.py b/caffe2/python/operator_test/box_with_nms_limit_op_test.py index bfbe9b7396fa..3131316feefd 100644 --- a/caffe2/python/operator_test/box_with_nms_limit_op_test.py +++ b/caffe2/python/operator_test/box_with_nms_limit_op_test.py @@ -1,7 +1,7 @@ -from __future__ import division -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/bucketize_op_test.py b/caffe2/python/operator_test/bucketize_op_test.py index d1cd6ada7f55..bf9af112a5b0 100644 --- a/caffe2/python/operator_test/bucketize_op_test.py +++ b/caffe2/python/operator_test/bucketize_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, dyndep from hypothesis import given import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/cast_op_test.py b/caffe2/python/operator_test/cast_op_test.py index f7ffb5b45b47..bf2a210086e6 100644 --- a/caffe2/python/operator_test/cast_op_test.py +++ b/caffe2/python/operator_test/cast_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/ceil_op_test.py b/caffe2/python/operator_test/ceil_op_test.py index 4e30c915ce2a..e8ee47702445 100644 --- a/caffe2/python/operator_test/ceil_op_test.py +++ b/caffe2/python/operator_test/ceil_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/channel_backprop_stats_op_test.py b/caffe2/python/operator_test/channel_backprop_stats_op_test.py index 7d614047f48d..7adc5ce24fb7 100644 --- a/caffe2/python/operator_test/channel_backprop_stats_op_test.py +++ 
b/caffe2/python/operator_test/channel_backprop_stats_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/channel_shuffle_test.py b/caffe2/python/operator_test/channel_shuffle_test.py index d420484bac6b..b821e7b6a43c 100644 --- a/caffe2/python/operator_test/channel_shuffle_test.py +++ b/caffe2/python/operator_test/channel_shuffle_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial diff --git a/caffe2/python/operator_test/channel_stats_op_test.py b/caffe2/python/operator_test/channel_stats_op_test.py index cbef433ae0d3..72eedc479dd6 100644 --- a/caffe2/python/operator_test/channel_stats_op_test.py +++ b/caffe2/python/operator_test/channel_stats_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/checkpoint_test.py b/caffe2/python/operator_test/checkpoint_test.py index 7449ab61f32d..3042e5989764 100644 --- a/caffe2/python/operator_test/checkpoint_test.py +++ b/caffe2/python/operator_test/checkpoint_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace, test_util import os diff --git a/caffe2/python/operator_test/clip_op_test.py b/caffe2/python/operator_test/clip_op_test.py index c2d9809c8d80..3304121aab08 100644 --- a/caffe2/python/operator_test/clip_op_test.py +++ b/caffe2/python/operator_test/clip_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/clip_tensor_op_test.py b/caffe2/python/operator_test/clip_tensor_op_test.py index ee5bd8f73eb3..efc86815bc49 100644 --- a/caffe2/python/operator_test/clip_tensor_op_test.py +++ b/caffe2/python/operator_test/clip_tensor_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py b/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py index b5d726d449fc..28e6cd3b3df6 100644 --- a/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py +++ b/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/python/operator_test/concat_split_op_test.py b/caffe2/python/operator_test/concat_split_op_test.py index bbe0e8eda1c1..1927b4eac78f 100644 --- 
a/caffe2/python/operator_test/concat_split_op_test.py +++ b/caffe2/python/operator_test/concat_split_op_test.py @@ -1,7 +1,7 @@ -from __future__ import unicode_literals -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/conditional_test.py b/caffe2/python/operator_test/conditional_test.py index 88d8fd8b7a27..2e214f089a45 100644 --- a/caffe2/python/operator_test/conditional_test.py +++ b/caffe2/python/operator_test/conditional_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/conftest.py b/caffe2/python/operator_test/conftest.py index ccd78eea4aa3..a240e98fc51e 100644 --- a/caffe2/python/operator_test/conftest.py +++ b/caffe2/python/operator_test/conftest.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import caffe2.python.serialized_test.serialized_test_util as serial diff --git a/caffe2/python/operator_test/conv_test.py b/caffe2/python/operator_test/conv_test.py index 3e24e05191ac..ae54cd37a91d 100644 --- a/caffe2/python/operator_test/conv_test.py +++ b/caffe2/python/operator_test/conv_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function + import collections import functools diff --git a/caffe2/python/operator_test/conv_transpose_test.py b/caffe2/python/operator_test/conv_transpose_test.py index 6bed93226f5b..4fcb6361d0a6 100644 --- a/caffe2/python/operator_test/conv_transpose_test.py +++ b/caffe2/python/operator_test/conv_transpose_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from hypothesis import assume, given, settings diff --git a/caffe2/python/operator_test/copy_ops_test.py b/caffe2/python/operator_test/copy_ops_test.py index 4efec570e812..2b8b756cdf61 100644 --- a/caffe2/python/operator_test/copy_ops_test.py +++ b/caffe2/python/operator_test/copy_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/copy_rows_to_tensor_op_test.py b/caffe2/python/operator_test/copy_rows_to_tensor_op_test.py index 9024ee3edfd1..8e914259bb78 100644 --- a/caffe2/python/operator_test/copy_rows_to_tensor_op_test.py +++ b/caffe2/python/operator_test/copy_rows_to_tensor_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import logging diff --git a/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py b/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py index 1124df94e67a..04bfbbe6f4f6 100644 --- a/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py +++ b/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given import 
hypothesis.strategies as st diff --git a/caffe2/python/operator_test/counter_ops_test.py b/caffe2/python/operator_test/counter_ops_test.py index 3ebe26415622..d57ff31508c6 100644 --- a/caffe2/python/operator_test/counter_ops_test.py +++ b/caffe2/python/operator_test/counter_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/crf_test.py b/caffe2/python/operator_test/crf_test.py index d9eb89fc3352..b75e7b7b1a10 100644 --- a/caffe2/python/operator_test/crf_test.py +++ b/caffe2/python/operator_test/crf_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace, crf, brew from caffe2.python.model_helper import ModelHelper import numpy as np diff --git a/caffe2/python/operator_test/cross_entropy_ops_test.py b/caffe2/python/operator_test/cross_entropy_ops_test.py index 25dc6791fa12..d1852e7dd9e8 100644 --- a/caffe2/python/operator_test/cross_entropy_ops_test.py +++ b/caffe2/python/operator_test/cross_entropy_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py b/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py index 21ca68fe007a..1dda7166e65a 100644 --- a/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py +++ b/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.test_util import caffe2_flaky diff --git a/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py b/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py index 0fd38a82b403..8bc7eb47d488 100644 --- a/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py +++ b/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/cudnn_recurrent_test.py b/caffe2/python/operator_test/cudnn_recurrent_test.py index 5de901026eb6..db1b826cfe41 100644 --- a/caffe2/python/operator_test/cudnn_recurrent_test.py +++ b/caffe2/python/operator_test/cudnn_recurrent_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import model_helper, workspace, core, rnn_cell from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/operator_test/data_couple_op_test.py b/caffe2/python/operator_test/data_couple_op_test.py index 32cf21e81bbf..d840207159b2 100644 --- a/caffe2/python/operator_test/data_couple_op_test.py +++ 
b/caffe2/python/operator_test/data_couple_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/dataset_ops_test.py b/caffe2/python/operator_test/dataset_ops_test.py index 138ac90e68c8..96d93dc5effb 100644 --- a/caffe2/python/operator_test/dataset_ops_test.py +++ b/caffe2/python/operator_test/dataset_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from caffe2.python import core, workspace, dataset from caffe2.python.dataset import Const diff --git a/caffe2/python/operator_test/deform_conv_test.py b/caffe2/python/operator_test/deform_conv_test.py index 31e407499063..f6ad0e38e73c 100644 --- a/caffe2/python/operator_test/deform_conv_test.py +++ b/caffe2/python/operator_test/deform_conv_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function + import os import unittest diff --git a/caffe2/python/operator_test/dense_vector_to_id_list_op_test.py b/caffe2/python/operator_test/dense_vector_to_id_list_op_test.py index aea30d890416..8b6f42417fd4 100644 --- a/caffe2/python/operator_test/dense_vector_to_id_list_op_test.py +++ b/caffe2/python/operator_test/dense_vector_to_id_list_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/depthwise_3x3_conv_test.py b/caffe2/python/operator_test/depthwise_3x3_conv_test.py index af431f1f07d4..2d6d6429f833 100644 --- a/caffe2/python/operator_test/depthwise_3x3_conv_test.py +++ b/caffe2/python/operator_test/depthwise_3x3_conv_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/detectron_keypoints.py b/caffe2/python/operator_test/detectron_keypoints.py index 2f34349beae4..1abff0675993 100644 --- a/caffe2/python/operator_test/detectron_keypoints.py +++ b/caffe2/python/operator_test/detectron_keypoints.py @@ -1,7 +1,7 @@ -from __future__ import division -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals + + + + try: import cv2 diff --git a/caffe2/python/operator_test/distance_op_test.py b/caffe2/python/operator_test/distance_op_test.py index 753b94d20f1f..e948fdae9673 100644 --- a/caffe2/python/operator_test/distance_op_test.py +++ b/caffe2/python/operator_test/distance_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/dropout_op_test.py b/caffe2/python/operator_test/dropout_op_test.py index c8c46127e4d9..84c2f7e35f56 100644 --- a/caffe2/python/operator_test/dropout_op_test.py +++ b/caffe2/python/operator_test/dropout_op_test.py 
@@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import assume, given, settings import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/duplicate_operands_test.py b/caffe2/python/operator_test/duplicate_operands_test.py index 385e69fded4c..179b42dbabc8 100644 --- a/caffe2/python/operator_test/duplicate_operands_test.py +++ b/caffe2/python/operator_test/duplicate_operands_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/elementwise_linear_op_test.py b/caffe2/python/operator_test/elementwise_linear_op_test.py index 8c7df5f33625..ac0dc3dd0975 100644 --- a/caffe2/python/operator_test/elementwise_linear_op_test.py +++ b/caffe2/python/operator_test/elementwise_linear_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/elementwise_logical_ops_test.py b/caffe2/python/operator_test/elementwise_logical_ops_test.py index e35b4a483c6d..3195d969dee5 100644 --- a/caffe2/python/operator_test/elementwise_logical_ops_test.py +++ b/caffe2/python/operator_test/elementwise_logical_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/elementwise_op_broadcast_test.py b/caffe2/python/operator_test/elementwise_op_broadcast_test.py index ef9c1b9c8cf3..605c1d741271 100644 --- a/caffe2/python/operator_test/elementwise_op_broadcast_test.py +++ b/caffe2/python/operator_test/elementwise_op_broadcast_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest diff --git a/caffe2/python/operator_test/elementwise_ops_test.py b/caffe2/python/operator_test/elementwise_ops_test.py index ca2b847f088c..ed7a09eb0857 100644 --- a/caffe2/python/operator_test/elementwise_ops_test.py +++ b/caffe2/python/operator_test/elementwise_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from hypothesis import given, assume, settings diff --git a/caffe2/python/operator_test/emptysample_ops_test.py b/caffe2/python/operator_test/emptysample_ops_test.py index a04e9d0e161d..0f728b723163 100644 --- a/caffe2/python/operator_test/emptysample_ops_test.py +++ b/caffe2/python/operator_test/emptysample_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/enforce_finite_op_test.py b/caffe2/python/operator_test/enforce_finite_op_test.py index 
c8c12e240946..b843bfdc95b9 100644 --- a/caffe2/python/operator_test/enforce_finite_op_test.py +++ b/caffe2/python/operator_test/enforce_finite_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given, settings import numpy as np diff --git a/caffe2/python/operator_test/ensure_clipped_test.py b/caffe2/python/operator_test/ensure_clipped_test.py index 8d3c638e1ba1..a89718745b1c 100644 --- a/caffe2/python/operator_test/ensure_clipped_test.py +++ b/caffe2/python/operator_test/ensure_clipped_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/ensure_cpu_output_op_test.py b/caffe2/python/operator_test/ensure_cpu_output_op_test.py index 509c28a5a8bb..4812ee3042e0 100644 --- a/caffe2/python/operator_test/ensure_cpu_output_op_test.py +++ b/caffe2/python/operator_test/ensure_cpu_output_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given import numpy as np diff --git a/caffe2/python/operator_test/erf_op_test.py b/caffe2/python/operator_test/erf_op_test.py index 5761c8409bd3..64714db4315c 100644 --- a/caffe2/python/operator_test/erf_op_test.py +++ b/caffe2/python/operator_test/erf_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import math diff --git a/caffe2/python/operator_test/expand_op_test.py b/caffe2/python/operator_test/expand_op_test.py index 4be96208fbba..0d198b1aff14 100644 --- a/caffe2/python/operator_test/expand_op_test.py +++ b/caffe2/python/operator_test/expand_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from hypothesis import given, settings diff --git a/caffe2/python/operator_test/fc_operator_test.py b/caffe2/python/operator_test/fc_operator_test.py index c08596f8717d..1e8b5522053d 100644 --- a/caffe2/python/operator_test/fc_operator_test.py +++ b/caffe2/python/operator_test/fc_operator_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core diff --git a/caffe2/python/operator_test/feature_maps_ops_test.py b/caffe2/python/operator_test/feature_maps_ops_test.py index 1d64b19b993f..19fa329c9389 100644 --- a/caffe2/python/operator_test/feature_maps_ops_test.py +++ b/caffe2/python/operator_test/feature_maps_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace, dyndep from caffe2.python.test_util import TestCase import numpy as np diff --git a/caffe2/python/operator_test/filler_ops_test.py b/caffe2/python/operator_test/filler_ops_test.py index 4a2d9419d7bc..e080dde3eb5f 100644 --- 
a/caffe2/python/operator_test/filler_ops_test.py +++ b/caffe2/python/operator_test/filler_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/find_op_test.py b/caffe2/python/operator_test/find_op_test.py index c6d2856c3514..fc25913d8744 100644 --- a/caffe2/python/operator_test/find_op_test.py +++ b/caffe2/python/operator_test/find_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/flatten_op_test.py b/caffe2/python/operator_test/flatten_op_test.py index 19d204e0bded..2e0340c68779 100644 --- a/caffe2/python/operator_test/flatten_op_test.py +++ b/caffe2/python/operator_test/flatten_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given import numpy as np diff --git a/caffe2/python/operator_test/flexible_top_k_test.py b/caffe2/python/operator_test/flexible_top_k_test.py index 9542ecd30691..3e0e5722b0ce 100644 --- a/caffe2/python/operator_test/flexible_top_k_test.py +++ b/caffe2/python/operator_test/flexible_top_k_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/floor_op_test.py b/caffe2/python/operator_test/floor_op_test.py index 5877cb6da4e8..8c0974bb8579 100644 --- a/caffe2/python/operator_test/floor_op_test.py +++ b/caffe2/python/operator_test/floor_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/fused_nbit_rowwise_conversion_ops_test.py b/caffe2/python/operator_test/fused_nbit_rowwise_conversion_ops_test.py index ecabe7d29ef0..12d0b0265afb 100644 --- a/caffe2/python/operator_test/fused_nbit_rowwise_conversion_ops_test.py +++ b/caffe2/python/operator_test/fused_nbit_rowwise_conversion_ops_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import math import struct diff --git a/caffe2/python/operator_test/fused_nbit_rowwise_test_helper.py b/caffe2/python/operator_test/fused_nbit_rowwise_test_helper.py index 09225385191a..e9af40a128a6 100644 --- a/caffe2/python/operator_test/fused_nbit_rowwise_test_helper.py +++ b/caffe2/python/operator_test/fused_nbit_rowwise_test_helper.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import numpy as np diff --git a/caffe2/python/operator_test/gather_ops_test.py b/caffe2/python/operator_test/gather_ops_test.py index 967131de38d8..fc23be13fdae 100644 --- a/caffe2/python/operator_test/gather_ops_test.py +++ b/caffe2/python/operator_test/gather_ops_test.py @@ -1,7 +1,7 @@ -from 
__future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/gather_ranges_op_test.py b/caffe2/python/operator_test/gather_ranges_op_test.py index 19d538c60556..c0d73af33601 100644 --- a/caffe2/python/operator_test/gather_ranges_op_test.py +++ b/caffe2/python/operator_test/gather_ranges_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial diff --git a/caffe2/python/operator_test/given_tensor_byte_string_to_uint8_fill_op_test.py b/caffe2/python/operator_test/given_tensor_byte_string_to_uint8_fill_op_test.py index 3b1b4bf86515..7dea8f308783 100644 --- a/caffe2/python/operator_test/given_tensor_byte_string_to_uint8_fill_op_test.py +++ b/caffe2/python/operator_test/given_tensor_byte_string_to_uint8_fill_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/given_tensor_fill_op_test.py b/caffe2/python/operator_test/given_tensor_fill_op_test.py index bcd277cf258b..3d929ce5c0ee 100644 --- a/caffe2/python/operator_test/given_tensor_fill_op_test.py +++ b/caffe2/python/operator_test/given_tensor_fill_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/glu_op_test.py b/caffe2/python/operator_test/glu_op_test.py index f70c0739ded8..f38df09ec9fb 100644 --- a/caffe2/python/operator_test/glu_op_test.py +++ b/caffe2/python/operator_test/glu_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/group_conv_test.py b/caffe2/python/operator_test/group_conv_test.py index 1d46888e791a..62aba236d5ba 100644 --- a/caffe2/python/operator_test/group_conv_test.py +++ b/caffe2/python/operator_test/group_conv_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from hypothesis import assume, given, settings diff --git a/caffe2/python/operator_test/group_norm_op_test.py b/caffe2/python/operator_test/group_norm_op_test.py index d17998c32986..14300beed3f9 100644 --- a/caffe2/python/operator_test/group_norm_op_test.py +++ b/caffe2/python/operator_test/group_norm_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/gru_test.py b/caffe2/python/operator_test/gru_test.py index 99da7a3f5626..99444f39ac26 100644 --- a/caffe2/python/operator_test/gru_test.py +++ b/caffe2/python/operator_test/gru_test.py @@ -1,7 +1,7 @@ -from __future__ 
import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace, core, scope, gru_cell from caffe2.python.model_helper import ModelHelper diff --git a/caffe2/python/operator_test/heatmap_max_keypoint_op_test.py b/caffe2/python/operator_test/heatmap_max_keypoint_op_test.py index ae8c1dc22799..e683a04d7998 100644 --- a/caffe2/python/operator_test/heatmap_max_keypoint_op_test.py +++ b/caffe2/python/operator_test/heatmap_max_keypoint_op_test.py @@ -1,7 +1,7 @@ -from __future__ import division -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import torch diff --git a/caffe2/python/operator_test/hsm_test.py b/caffe2/python/operator_test/hsm_test.py index f2321adc8e01..245bca210ad9 100644 --- a/caffe2/python/operator_test/hsm_test.py +++ b/caffe2/python/operator_test/hsm_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given, settings import numpy as np import unittest diff --git a/caffe2/python/operator_test/hyperbolic_ops_test.py b/caffe2/python/operator_test/hyperbolic_ops_test.py index 9fdf0cabb0bd..90a8197e7ccf 100644 --- a/caffe2/python/operator_test/hyperbolic_ops_test.py +++ b/caffe2/python/operator_test/hyperbolic_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/im2col_col2im_test.py b/caffe2/python/operator_test/im2col_col2im_test.py index 98e9d61b5bd0..760228382bc6 100644 --- a/caffe2/python/operator_test/im2col_col2im_test.py +++ b/caffe2/python/operator_test/im2col_col2im_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import assume, given, settings diff --git a/caffe2/python/operator_test/image_input_op_test.py b/caffe2/python/operator_test/image_input_op_test.py index 79acc60739f1..0de1f0ad048b 100644 --- a/caffe2/python/operator_test/image_input_op_test.py +++ b/caffe2/python/operator_test/image_input_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest try: diff --git a/caffe2/python/operator_test/index_hash_ops_test.py b/caffe2/python/operator_test/index_hash_ops_test.py index f7c6d0cdc14a..1eb7ffa20691 100644 --- a/caffe2/python/operator_test/index_hash_ops_test.py +++ b/caffe2/python/operator_test/index_hash_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/index_ops_test.py b/caffe2/python/operator_test/index_ops_test.py index 642f340fad80..cf021f59362b 100644 --- a/caffe2/python/operator_test/index_ops_test.py +++ b/caffe2/python/operator_test/index_ops_test.py @@ -1,7 +1,7 @@ 
-from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase import numpy as np diff --git a/caffe2/python/operator_test/instance_norm_test.py b/caffe2/python/operator_test/instance_norm_test.py index e57b8a8e11d8..fb4f3c935ba8 100644 --- a/caffe2/python/operator_test/instance_norm_test.py +++ b/caffe2/python/operator_test/instance_norm_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from hypothesis import given, assume, settings diff --git a/caffe2/python/operator_test/integral_image_ops_test.py b/caffe2/python/operator_test/integral_image_ops_test.py index 212f807addcf..79d79ae6de21 100644 --- a/caffe2/python/operator_test/integral_image_ops_test.py +++ b/caffe2/python/operator_test/integral_image_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/jsd_ops_test.py b/caffe2/python/operator_test/jsd_ops_test.py index 51faa14b9029..6ed2db2e88c2 100644 --- a/caffe2/python/operator_test/jsd_ops_test.py +++ b/caffe2/python/operator_test/jsd_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/key_split_ops_test.py b/caffe2/python/operator_test/key_split_ops_test.py index be38ee38926f..18fddff58d17 100644 --- a/caffe2/python/operator_test/key_split_ops_test.py +++ b/caffe2/python/operator_test/key_split_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/lars_test.py b/caffe2/python/operator_test/lars_test.py index e2f02b29d26f..6f976520e06b 100644 --- a/caffe2/python/operator_test/lars_test.py +++ b/caffe2/python/operator_test/lars_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/layer_norm_op_test.py b/caffe2/python/operator_test/layer_norm_op_test.py index 89ba4b2017bd..62e94afe9e7d 100644 --- a/caffe2/python/operator_test/layer_norm_op_test.py +++ b/caffe2/python/operator_test/layer_norm_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import brew, core, workspace from caffe2.python.model_helper import ModelHelper @@ -373,6 +373,34 @@ def test_layer_norm_brew_wrapper(self, X, gc, dc): self.ws.create_net(model.param_init_net).run() self.ws.create_net(model.net).run() + @given(N=st.integers(1, 10), elementwise_affine=st.booleans(), **hu.gcs) + @settings(deadline=None) + def test_layer_norm_with_empty_batch(self, N, 
elementwise_affine, gc, dc): + X = np.random.randn(0, N).astype(np.float32) + gamma = np.random.rand(N).astype(np.float32) + beta = np.random.rand(N).astype(np.float32) + + op = core.CreateOperator( + "LayerNorm", + ["X", "gamma", "beta"] if elementwise_affine else ["X"], + ["Y", "mean", "sigma"], + elementwise_affine=elementwise_affine, + ) + + def ref(X, gamma=None, beta=None): + Y = np.zeros_like(X) + axis = 1 + mean = np.zeros(X.shape[:axis] + (1,), dtype=X.dtype) + sigma = np.zeros(X.shape[:axis] + (1,), dtype=X.dtype) + return Y, mean, sigma + + + inputs = [X, gamma, beta] if elementwise_affine else [X] + self.assertReferenceChecks(gc, op, inputs, ref) + self.assertDeviceChecks(dc, op, inputs, [0, 1]) + for i in range(len(inputs)): + self.assertGradientChecks(gc, op, inputs, i, [0]) + if __name__ == "__main__": unittest.main() diff --git a/caffe2/python/operator_test/leaky_relu_test.py b/caffe2/python/operator_test/leaky_relu_test.py index 2eaa782eeefd..9a888cac7901 100644 --- a/caffe2/python/operator_test/leaky_relu_test.py +++ b/caffe2/python/operator_test/leaky_relu_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from hypothesis import given, assume diff --git a/caffe2/python/operator_test/learning_rate_adaption_op_test.py b/caffe2/python/operator_test/learning_rate_adaption_op_test.py index 3a5d44663771..1891171b80d8 100644 --- a/caffe2/python/operator_test/learning_rate_adaption_op_test.py +++ b/caffe2/python/operator_test/learning_rate_adaption_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/learning_rate_op_test.py b/caffe2/python/operator_test/learning_rate_op_test.py index 1a1f9eb8c842..8d17c0c7ef08 100644 --- a/caffe2/python/operator_test/learning_rate_op_test.py +++ b/caffe2/python/operator_test/learning_rate_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu @@ -50,7 +50,7 @@ def ref(iter): def test_hill_learning_rate_op(self, gc, dc): iter = np.random.randint(low=1, high=1e5, size=1) - num_iter = int(np.random.randint(low=1e2, high=1e3, size=1)) + num_iter = int(np.random.randint(low=1e2, high=1e8, size=1)) start_multiplier = 1e-4 gamma = 1.0 power = 0.5 diff --git a/caffe2/python/operator_test/length_split_op_test.py b/caffe2/python/operator_test/length_split_op_test.py index fa3ac0826230..28d7134ac5e8 100644 --- a/caffe2/python/operator_test/length_split_op_test.py +++ b/caffe2/python/operator_test/length_split_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/lengths_pad_op_test.py b/caffe2/python/operator_test/lengths_pad_op_test.py index d9cd2b244604..626ec0542b7d 100644 --- a/caffe2/python/operator_test/lengths_pad_op_test.py +++ b/caffe2/python/operator_test/lengths_pad_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from 
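The `test_layer_norm_with_empty_batch` case added above in `layer_norm_op_test.py` asserts that the Caffe2 LayerNorm op tolerates a zero-row input for forward, device, and gradient checks. As a hedged aside (not part of the diff), the same expectation can be sanity-checked from the PyTorch side; the shapes and the use of `torch.nn.LayerNorm` below are illustrative assumptions only.

```python
# Illustrative sketch: mirrors what the new Caffe2 test asserts, namely that a
# zero-row batch flows through layer norm and its gradient without errors.
import torch

x = torch.randn(0, 5, requires_grad=True)           # empty batch, 5 features
layer_norm = torch.nn.LayerNorm(5, elementwise_affine=True)

y = layer_norm(x)
assert y.shape == (0, 5)                             # output is empty but well-formed

y.sum().backward()                                   # backward is a no-op, not an error
assert x.grad.shape == (0, 5)
```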
__future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py b/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py index 88c99c3da337..fc4e89e2545b 100644 --- a/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py +++ b/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/lengths_tile_op_test.py b/caffe2/python/operator_test/lengths_tile_op_test.py index 4a9a6b0ff1a9..e0a5f9609588 100644 --- a/caffe2/python/operator_test/lengths_tile_op_test.py +++ b/caffe2/python/operator_test/lengths_tile_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/lengths_top_k_ops_test.py b/caffe2/python/operator_test/lengths_top_k_ops_test.py index 8bc27c31144f..b8b082a02125 100644 --- a/caffe2/python/operator_test/lengths_top_k_ops_test.py +++ b/caffe2/python/operator_test/lengths_top_k_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/listwise_l2r_operator_test.py b/caffe2/python/operator_test/listwise_l2r_operator_test.py index 8f4f680de109..c08f1180a920 100644 --- a/caffe2/python/operator_test/listwise_l2r_operator_test.py +++ b/caffe2/python/operator_test/listwise_l2r_operator_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/load_save_test.py b/caffe2/python/operator_test/load_save_test.py index a5e28479cf10..845bafee4702 100644 --- a/caffe2/python/operator_test/load_save_test.py +++ b/caffe2/python/operator_test/load_save_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import errno import hypothesis.strategies as st from hypothesis import given, assume, settings diff --git a/caffe2/python/operator_test/locally_connected_op_test.py b/caffe2/python/operator_test/locally_connected_op_test.py index cfd49b8a7eb8..6eb3181ea9ad 100644 --- a/caffe2/python/operator_test/locally_connected_op_test.py +++ b/caffe2/python/operator_test/locally_connected_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from hypothesis import given, settings, assume diff --git a/caffe2/python/operator_test/loss_ops_test.py b/caffe2/python/operator_test/loss_ops_test.py index e57bdb7a1d41..24cb65ac96f8 100644 --- a/caffe2/python/operator_test/loss_ops_test.py +++ b/caffe2/python/operator_test/loss_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import 
absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/lpnorm_op_test.py b/caffe2/python/operator_test/lpnorm_op_test.py index 1fcacc4f26f8..3a58cbe6d960 100644 --- a/caffe2/python/operator_test/lpnorm_op_test.py +++ b/caffe2/python/operator_test/lpnorm_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/map_ops_test.py b/caffe2/python/operator_test/map_ops_test.py index add86a3a467e..dcc8b295f7c3 100644 --- a/caffe2/python/operator_test/map_ops_test.py +++ b/caffe2/python/operator_test/map_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import itertools import numpy as np diff --git a/caffe2/python/operator_test/margin_ranking_criterion_op_test.py b/caffe2/python/operator_test/margin_ranking_criterion_op_test.py index 354aed27aaf4..e28dd1ce28f8 100644 --- a/caffe2/python/operator_test/margin_ranking_criterion_op_test.py +++ b/caffe2/python/operator_test/margin_ranking_criterion_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/math_ops_test.py b/caffe2/python/operator_test/math_ops_test.py index e18025ffb92d..4849b83648f8 100644 --- a/caffe2/python/operator_test/math_ops_test.py +++ b/caffe2/python/operator_test/math_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/matmul_op_test.py b/caffe2/python/operator_test/matmul_op_test.py index fababb13c54a..b8cef19b24df 100644 --- a/caffe2/python/operator_test/matmul_op_test.py +++ b/caffe2/python/operator_test/matmul_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import inspect diff --git a/caffe2/python/operator_test/mean_op_test.py b/caffe2/python/operator_test/mean_op_test.py index 77c6b82625b1..5830089f8e9b 100644 --- a/caffe2/python/operator_test/mean_op_test.py +++ b/caffe2/python/operator_test/mean_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/merge_id_lists_op_test.py b/caffe2/python/operator_test/merge_id_lists_op_test.py index 9f3302c6e75a..36b765557505 100644 --- a/caffe2/python/operator_test/merge_id_lists_op_test.py +++ b/caffe2/python/operator_test/merge_id_lists_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import 
print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/mkl_conv_op_test.py b/caffe2/python/operator_test/mkl_conv_op_test.py index b72848b9a422..595debf977fe 100644 --- a/caffe2/python/operator_test/mkl_conv_op_test.py +++ b/caffe2/python/operator_test/mkl_conv_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/mkl_packed_fc_op_test.py b/caffe2/python/operator_test/mkl_packed_fc_op_test.py index 59546d3891e9..2f889d693444 100644 --- a/caffe2/python/operator_test/mkl_packed_fc_op_test.py +++ b/caffe2/python/operator_test/mkl_packed_fc_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/mod_op_test.py b/caffe2/python/operator_test/mod_op_test.py index 92a318f3f10f..914bffd2067c 100644 --- a/caffe2/python/operator_test/mod_op_test.py +++ b/caffe2/python/operator_test/mod_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy diff --git a/caffe2/python/operator_test/moments_op_test.py b/caffe2/python/operator_test/moments_op_test.py index ae9d9158f506..3b270df254ce 100644 --- a/caffe2/python/operator_test/moments_op_test.py +++ b/caffe2/python/operator_test/moments_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/momentum_sgd_test.py b/caffe2/python/operator_test/momentum_sgd_test.py index a37e27141bd0..58f16e87a21c 100644 --- a/caffe2/python/operator_test/momentum_sgd_test.py +++ b/caffe2/python/operator_test/momentum_sgd_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/mpi_test.py b/caffe2/python/operator_test/mpi_test.py index 0885289c7c1a..bb111a125fc0 100644 --- a/caffe2/python/operator_test/mpi_test.py +++ b/caffe2/python/operator_test/mpi_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/mul_gradient_benchmark.py b/caffe2/python/operator_test/mul_gradient_benchmark.py index 721676239409..2e11aefcb497 100644 --- a/caffe2/python/operator_test/mul_gradient_benchmark.py +++ b/caffe2/python/operator_test/mul_gradient_benchmark.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import numpy as np diff --git 
a/caffe2/python/operator_test/negate_gradient_op_test.py b/caffe2/python/operator_test/negate_gradient_op_test.py index 14ca954d363f..137be1eece34 100644 --- a/caffe2/python/operator_test/negate_gradient_op_test.py +++ b/caffe2/python/operator_test/negate_gradient_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace, core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/ngram_ops_test.py b/caffe2/python/operator_test/ngram_ops_test.py index 70aad5cab814..3f4e57fa230b 100644 --- a/caffe2/python/operator_test/ngram_ops_test.py +++ b/caffe2/python/operator_test/ngram_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/normalize_op_test.py b/caffe2/python/operator_test/normalize_op_test.py index 46f88a1de079..7a35e0bafa31 100644 --- a/caffe2/python/operator_test/normalize_op_test.py +++ b/caffe2/python/operator_test/normalize_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import functools diff --git a/caffe2/python/operator_test/numpy_tile_op_test.py b/caffe2/python/operator_test/numpy_tile_op_test.py index 398b0d4b93ab..a202581f808c 100644 --- a/caffe2/python/operator_test/numpy_tile_op_test.py +++ b/caffe2/python/operator_test/numpy_tile_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/one_hot_ops_test.py b/caffe2/python/operator_test/one_hot_ops_test.py index d0b7a08ee706..593d5b5aa58c 100644 --- a/caffe2/python/operator_test/one_hot_ops_test.py +++ b/caffe2/python/operator_test/one_hot_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/operator_test/onnx_while_test.py b/caffe2/python/operator_test/onnx_while_test.py index 811e38e34af7..4cff53b87d6e 100644 --- a/caffe2/python/operator_test/onnx_while_test.py +++ b/caffe2/python/operator_test/onnx_while_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/order_switch_test.py b/caffe2/python/operator_test/order_switch_test.py index 3777fdd7695d..7b3f40a27c97 100644 --- a/caffe2/python/operator_test/order_switch_test.py +++ b/caffe2/python/operator_test/order_switch_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/pack_ops_test.py b/caffe2/python/operator_test/pack_ops_test.py index 84f3f46a6dc1..698fbb76df88 100644 --- a/caffe2/python/operator_test/pack_ops_test.py +++ 
b/caffe2/python/operator_test/pack_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/pack_rnn_sequence_op_test.py b/caffe2/python/operator_test/pack_rnn_sequence_op_test.py index 6bf2315ca0c5..9a76e6b847a5 100644 --- a/caffe2/python/operator_test/pack_rnn_sequence_op_test.py +++ b/caffe2/python/operator_test/pack_rnn_sequence_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/pad_test.py b/caffe2/python/operator_test/pad_test.py index 43cd10c23188..6d4e6bbdcd08 100644 --- a/caffe2/python/operator_test/pad_test.py +++ b/caffe2/python/operator_test/pad_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/partition_ops_test.py b/caffe2/python/operator_test/partition_ops_test.py index a5a7db12b1ef..b600c302d83b 100644 --- a/caffe2/python/operator_test/partition_ops_test.py +++ b/caffe2/python/operator_test/partition_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from caffe2.python import core, workspace from caffe2.python.test_util import TestCase, rand_array diff --git a/caffe2/python/operator_test/percentile_op_test.py b/caffe2/python/operator_test/percentile_op_test.py index 54c42bf63917..d81b0a963185 100644 --- a/caffe2/python/operator_test/percentile_op_test.py +++ b/caffe2/python/operator_test/percentile_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace, dyndep import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/piecewise_linear_transform_test.py b/caffe2/python/operator_test/piecewise_linear_transform_test.py index 463380306ce4..d7c4e0df4416 100644 --- a/caffe2/python/operator_test/piecewise_linear_transform_test.py +++ b/caffe2/python/operator_test/piecewise_linear_transform_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/pooling_test.py b/caffe2/python/operator_test/pooling_test.py index 743cee5cef3c..7ef98249bd79 100644 --- a/caffe2/python/operator_test/pooling_test.py +++ b/caffe2/python/operator_test/pooling_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from hypothesis import assume, given, settings diff --git a/caffe2/python/operator_test/prepend_dim_test.py b/caffe2/python/operator_test/prepend_dim_test.py index 6cf8e7a81b5e..d794ba2162b9 
100644 --- a/caffe2/python/operator_test/prepend_dim_test.py +++ b/caffe2/python/operator_test/prepend_dim_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/python_op_test.py b/caffe2/python/operator_test/python_op_test.py index 7467c8c3900c..b071070151d1 100644 --- a/caffe2/python/operator_test/python_op_test.py +++ b/caffe2/python/operator_test/python_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.core import CreatePythonOperator import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/quantile_test.py b/caffe2/python/operator_test/quantile_test.py index 6a4250d06183..39f3728d8e81 100644 --- a/caffe2/python/operator_test/quantile_test.py +++ b/caffe2/python/operator_test/quantile_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function + import unittest diff --git a/caffe2/python/operator_test/rand_quantization_op_speed_test.py b/caffe2/python/operator_test/rand_quantization_op_speed_test.py index ce0e84028541..1c56faff645f 100644 --- a/caffe2/python/operator_test/rand_quantization_op_speed_test.py +++ b/caffe2/python/operator_test/rand_quantization_op_speed_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import time diff --git a/caffe2/python/operator_test/rand_quantization_op_test.py b/caffe2/python/operator_test/rand_quantization_op_test.py index 811a20505a3c..e244f77149e1 100644 --- a/caffe2/python/operator_test/rand_quantization_op_test.py +++ b/caffe2/python/operator_test/rand_quantization_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import struct diff --git a/caffe2/python/operator_test/rank_loss_operator_test.py b/caffe2/python/operator_test/rank_loss_operator_test.py index 94220d76762d..2d52da293127 100644 --- a/caffe2/python/operator_test/rank_loss_operator_test.py +++ b/caffe2/python/operator_test/rank_loss_operator_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from hypothesis import given diff --git a/caffe2/python/operator_test/rebatching_queue_test.py b/caffe2/python/operator_test/rebatching_queue_test.py index 930fad30d663..53d3fd4f4ecc 100644 --- a/caffe2/python/operator_test/rebatching_queue_test.py +++ b/caffe2/python/operator_test/rebatching_queue_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/record_queue_test.py b/caffe2/python/operator_test/record_queue_test.py index d32b3e794ab4..00e47ed1cb68 100644 --- a/caffe2/python/operator_test/record_queue_test.py +++ b/caffe2/python/operator_test/record_queue_test.py @@ 
-1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.dataset import Dataset from caffe2.python.schema import ( diff --git a/caffe2/python/operator_test/recurrent_net_executor_test.py b/caffe2/python/operator_test/recurrent_net_executor_test.py index 24bd0122f4fb..5d9b83604423 100644 --- a/caffe2/python/operator_test/recurrent_net_executor_test.py +++ b/caffe2/python/operator_test/recurrent_net_executor_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import model_helper, workspace, core, rnn_cell, test_util diff --git a/caffe2/python/operator_test/recurrent_network_test.py b/caffe2/python/operator_test/recurrent_network_test.py index 7cf79edfafed..13650e6cad4e 100644 --- a/caffe2/python/operator_test/recurrent_network_test.py +++ b/caffe2/python/operator_test/recurrent_network_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import recurrent, workspace from caffe2.python.model_helper import ModelHelper diff --git a/caffe2/python/operator_test/reduce_ops_test.py b/caffe2/python/operator_test/reduce_ops_test.py index ffb5e8a02667..727631befe89 100644 --- a/caffe2/python/operator_test/reduce_ops_test.py +++ b/caffe2/python/operator_test/reduce_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from hypothesis import given, settings diff --git a/caffe2/python/operator_test/reduction_ops_test.py b/caffe2/python/operator_test/reduction_ops_test.py index 018024900281..7d4287df6609 100644 --- a/caffe2/python/operator_test/reduction_ops_test.py +++ b/caffe2/python/operator_test/reduction_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/reshape_ops_test.py b/caffe2/python/operator_test/reshape_ops_test.py index 9c57ed4f3090..a42f00bbf82f 100644 --- a/caffe2/python/operator_test/reshape_ops_test.py +++ b/caffe2/python/operator_test/reshape_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import six from numpy.testing import assert_array_equal diff --git a/caffe2/python/operator_test/resize_op_test.py b/caffe2/python/operator_test/resize_op_test.py index 893e09cf6443..cd90656f607d 100644 --- a/caffe2/python/operator_test/resize_op_test.py +++ b/caffe2/python/operator_test/resize_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/rmac_regions_op_test.py b/caffe2/python/operator_test/rmac_regions_op_test.py index 
856832c34b99..084d7402df5f 100644 --- a/caffe2/python/operator_test/rmac_regions_op_test.py +++ b/caffe2/python/operator_test/rmac_regions_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/rms_norm_op_test.py b/caffe2/python/operator_test/rms_norm_op_test.py index f5a35701877c..797b3c9a01c3 100644 --- a/caffe2/python/operator_test/rms_norm_op_test.py +++ b/caffe2/python/operator_test/rms_norm_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/rnn_cell_test.py b/caffe2/python/operator_test/rnn_cell_test.py index 64cd7bf48913..8fe037ccb70c 100644 --- a/caffe2/python/operator_test/rnn_cell_test.py +++ b/caffe2/python/operator_test/rnn_cell_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import ( core, gradient_checker, rnn_cell, workspace, scope, utils diff --git a/caffe2/python/operator_test/roi_align_rotated_op_test.py b/caffe2/python/operator_test/roi_align_rotated_op_test.py index 0487d962e6fb..c74157a039b0 100644 --- a/caffe2/python/operator_test/roi_align_rotated_op_test.py +++ b/caffe2/python/operator_test/roi_align_rotated_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/rowwise_counter_test.py b/caffe2/python/operator_test/rowwise_counter_test.py index a00dd24b3f2c..a9dacc5a6d86 100644 --- a/caffe2/python/operator_test/rowwise_counter_test.py +++ b/caffe2/python/operator_test/rowwise_counter_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function + import unittest diff --git a/caffe2/python/operator_test/scale_op_test.py b/caffe2/python/operator_test/scale_op_test.py index 14e17dc2c5d5..b5507e2013fa 100644 --- a/caffe2/python/operator_test/scale_op_test.py +++ b/caffe2/python/operator_test/scale_op_test.py @@ -1,7 +1,7 @@ -from __future__ import division -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/segment_ops_test.py b/caffe2/python/operator_test/segment_ops_test.py index 01c415eac953..f991a7dde211 100644 --- a/caffe2/python/operator_test/segment_ops_test.py +++ b/caffe2/python/operator_test/segment_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from functools import partial from hypothesis import given, settings diff --git a/caffe2/python/operator_test/selu_op_test.py b/caffe2/python/operator_test/selu_op_test.py index fc903f159a4e..4dd2fa1848bf 100644 --- a/caffe2/python/operator_test/selu_op_test.py +++ b/caffe2/python/operator_test/selu_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ 
import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/sequence_ops_test.py b/caffe2/python/operator_test/sequence_ops_test.py index 720bf9f02030..4609473f91f0 100644 --- a/caffe2/python/operator_test/sequence_ops_test.py +++ b/caffe2/python/operator_test/sequence_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from functools import partial diff --git a/caffe2/python/operator_test/shape_inference_test.py b/caffe2/python/operator_test/shape_inference_test.py index aca6ff38a517..702effc226d6 100644 --- a/caffe2/python/operator_test/shape_inference_test.py +++ b/caffe2/python/operator_test/shape_inference_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/python/operator_test/sinusoid_position_encoding_op_test.py b/caffe2/python/operator_test/sinusoid_position_encoding_op_test.py index a925783c206e..6e8cae62dbff 100644 --- a/caffe2/python/operator_test/sinusoid_position_encoding_op_test.py +++ b/caffe2/python/operator_test/sinusoid_position_encoding_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/softmax_ops_test.py b/caffe2/python/operator_test/softmax_ops_test.py index f0f6c22cd10b..3ae26de6b513 100644 --- a/caffe2/python/operator_test/softmax_ops_test.py +++ b/caffe2/python/operator_test/softmax_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from hypothesis import given, settings diff --git a/caffe2/python/operator_test/softplus_op_test.py b/caffe2/python/operator_test/softplus_op_test.py index ac28a1a9a51e..dd183b774f92 100644 --- a/caffe2/python/operator_test/softplus_op_test.py +++ b/caffe2/python/operator_test/softplus_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/sparse_dropout_with_replacement_op_test.py b/caffe2/python/operator_test/sparse_dropout_with_replacement_op_test.py index 14d637f50f41..2ba21bb6d44f 100644 --- a/caffe2/python/operator_test/sparse_dropout_with_replacement_op_test.py +++ b/caffe2/python/operator_test/sparse_dropout_with_replacement_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/sparse_gradient_checker_test.py b/caffe2/python/operator_test/sparse_gradient_checker_test.py index 9bdae01d1318..f1f85b1f9bec 100644 --- 
a/caffe2/python/operator_test/sparse_gradient_checker_test.py +++ b/caffe2/python/operator_test/sparse_gradient_checker_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from scipy.sparse import coo_matrix diff --git a/caffe2/python/operator_test/sparse_lengths_sum_benchmark.py b/caffe2/python/operator_test/sparse_lengths_sum_benchmark.py index 74690c8a2c56..fb958492cfa9 100644 --- a/caffe2/python/operator_test/sparse_lengths_sum_benchmark.py +++ b/caffe2/python/operator_test/sparse_lengths_sum_benchmark.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import argparse import datetime diff --git a/caffe2/python/operator_test/sparse_lp_regularizer_test.py b/caffe2/python/operator_test/sparse_lp_regularizer_test.py index b0d0b4b5c9b3..7ea32bd69a29 100644 --- a/caffe2/python/operator_test/sparse_lp_regularizer_test.py +++ b/caffe2/python/operator_test/sparse_lp_regularizer_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis from hypothesis import given, settings, HealthCheck diff --git a/caffe2/python/operator_test/sparse_normalize_test.py b/caffe2/python/operator_test/sparse_normalize_test.py index bd8dbd5f7b53..ecc4ae0c8d22 100644 --- a/caffe2/python/operator_test/sparse_normalize_test.py +++ b/caffe2/python/operator_test/sparse_normalize_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis from hypothesis import given, settings, HealthCheck diff --git a/caffe2/python/operator_test/sparse_ops_test.py b/caffe2/python/operator_test/sparse_ops_test.py index 1cf243ed05c4..089174007b18 100644 --- a/caffe2/python/operator_test/sparse_ops_test.py +++ b/caffe2/python/operator_test/sparse_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.test_util import rand_array diff --git a/caffe2/python/operator_test/sparse_to_dense_mask_op_test.py b/caffe2/python/operator_test/sparse_to_dense_mask_op_test.py index 03deb62d8513..41ec8808bb6a 100644 --- a/caffe2/python/operator_test/sparse_to_dense_mask_op_test.py +++ b/caffe2/python/operator_test/sparse_to_dense_mask_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/spatial_bn_op_test.py b/caffe2/python/operator_test/spatial_bn_op_test.py index 1186161e5f46..35f7bd2a5e29 100644 --- a/caffe2/python/operator_test/spatial_bn_op_test.py +++ b/caffe2/python/operator_test/spatial_bn_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import brew, core, utils, workspace diff --git a/caffe2/python/operator_test/specialized_segment_ops_test.py 
b/caffe2/python/operator_test/specialized_segment_ops_test.py index fe768e193c88..4f1842ac4664 100644 --- a/caffe2/python/operator_test/specialized_segment_ops_test.py +++ b/caffe2/python/operator_test/specialized_segment_ops_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import unittest diff --git a/caffe2/python/operator_test/square_root_divide_op_test.py b/caffe2/python/operator_test/square_root_divide_op_test.py index 172c6cbafa16..5bd6cb1d08f8 100644 --- a/caffe2/python/operator_test/square_root_divide_op_test.py +++ b/caffe2/python/operator_test/square_root_divide_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from functools import partial diff --git a/caffe2/python/operator_test/stats_ops_test.py b/caffe2/python/operator_test/stats_ops_test.py index edc36facb236..6114dfed3b10 100644 --- a/caffe2/python/operator_test/stats_ops_test.py +++ b/caffe2/python/operator_test/stats_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/stats_put_ops_test.py b/caffe2/python/operator_test/stats_put_ops_test.py index 0a42d5d23728..12a9e6826fd1 100644 --- a/caffe2/python/operator_test/stats_put_ops_test.py +++ b/caffe2/python/operator_test/stats_put_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/storm_test.py b/caffe2/python/operator_test/storm_test.py index 2ae402a8a290..c97f631d2160 100644 --- a/caffe2/python/operator_test/storm_test.py +++ b/caffe2/python/operator_test/storm_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import functools diff --git a/caffe2/python/operator_test/string_ops_test.py b/caffe2/python/operator_test/string_ops_test.py index 969e8c7e11e5..eedb57be1d6c 100644 --- a/caffe2/python/operator_test/string_ops_test.py +++ b/caffe2/python/operator_test/string_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/text_file_reader_test.py b/caffe2/python/operator_test/text_file_reader_test.py index 41ba814af6ab..8889ddb9f53c 100644 --- a/caffe2/python/operator_test/text_file_reader_test.py +++ b/caffe2/python/operator_test/text_file_reader_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.text_file_reader import TextFileReader from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/thresholded_relu_op_test.py 
b/caffe2/python/operator_test/thresholded_relu_op_test.py index 9c103c85c03c..0cd5c0f77895 100644 --- a/caffe2/python/operator_test/thresholded_relu_op_test.py +++ b/caffe2/python/operator_test/thresholded_relu_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/tile_op_test.py b/caffe2/python/operator_test/tile_op_test.py index 51471f797b34..d39dfeee0ad7 100644 --- a/caffe2/python/operator_test/tile_op_test.py +++ b/caffe2/python/operator_test/tile_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/top_k_test.py b/caffe2/python/operator_test/top_k_test.py index 85cf902812ee..fa628456c3a4 100644 --- a/caffe2/python/operator_test/top_k_test.py +++ b/caffe2/python/operator_test/top_k_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st import numpy as np diff --git a/caffe2/python/operator_test/torch_integration_test.py b/caffe2/python/operator_test/torch_integration_test.py index a1ddbaa9509e..82ebd2d65f49 100644 --- a/caffe2/python/operator_test/torch_integration_test.py +++ b/caffe2/python/operator_test/torch_integration_test.py @@ -1,12 +1,12 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + + +import struct +import unittest import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st import numpy as np -import struct import torch -import unittest - from caffe2.python import core, workspace from hypothesis import given, settings from scipy.stats import norm @@ -77,7 +77,7 @@ def create_bbox_transform_inputs(roi_counts, num_classes, rotated): def bytes_to_floats(byte_matrix): floats = np.empty([np.shape(byte_matrix)[0], 1], dtype=np.float32) for i, byte_values in enumerate(byte_matrix): - floats[i], = struct.unpack('f', bytearray(byte_values)) + (floats[i],) = struct.unpack("f", bytearray(byte_values)) return floats @@ -85,7 +85,7 @@ def floats_to_bytes(floats): byte_matrix = np.empty([np.shape(floats)[0], 4], dtype=np.uint8) for i, value in enumerate(floats): assert isinstance(value, np.float32), (value, floats) - as_bytes = struct.pack('f', value) + as_bytes = struct.pack("f", value) # In Python3 bytes will be a list of int, in Python2 a list of string if isinstance(as_bytes[0], int): byte_matrix[i] = list(as_bytes) @@ -268,6 +268,69 @@ def box_with_nms_limit_ref(): for o, o_ref in zip(outputs, output_refs): torch.testing.assert_allclose(o, o_ref) + @given( + dim_1=st.integers(min_value=10, max_value=10), + dim_2=st.integers(min_value=3, max_value=3), + dim_3=st.integers(min_value=2, max_value=2), + ) + def test_sparse_to_dense_mask(self, dim_1, dim_2, dim_3): + indices = np.array([i + 1 for i in range(dim_1)]).astype(np.int32) + values = np.random.rand(dim_1, dim_2, dim_3).astype(np.float32) + default_value = np.zeros((dim_2, dim_3)).astype(np.float32) + mask = [2, 4, 9] + + def sparse_to_dense_mask_ref(return_presence_mask=False): + ref_op = core.CreateOperator( + "SparseToDenseMask", + ["indices", "values", "default_value"], 
+ ["output", "presence_mask"], + mask=mask, + return_presence_mask=return_presence_mask, + ) + workspace.FeedBlob("indices", indices) + workspace.FeedBlob("values", values) + workspace.FeedBlob("default_value", default_value) + workspace.RunOperatorOnce(ref_op) + + if return_presence_mask: + return ( + workspace.FetchBlob("output"), + workspace.FetchBlob("presence_mask"), + ) + + return workspace.FetchBlob("output") + + # Testing return_presence_mask = False + output = sparse_to_dense_mask_ref() + output = torch.tensor(output) + + a, _ = torch.ops._caffe2.SparseToDenseMask( + torch.tensor(indices), + torch.tensor(values), + torch.tensor(default_value), + None, + mask=mask, + ) + + torch.testing.assert_allclose(output, a) + + # Testing return_presence_mask = True + output, presence_mask = sparse_to_dense_mask_ref(return_presence_mask=True) + output = torch.tensor(output) + presence_mask = torch.tensor(presence_mask) + + a, b = torch.ops._caffe2.SparseToDenseMask( + torch.tensor(indices), + torch.tensor(values), + torch.tensor(default_value), + None, + mask=mask, + return_presence_mask=True, + ) + + torch.testing.assert_allclose(output, a) + torch.testing.assert_allclose(presence_mask, b) + @given( A=st.integers(min_value=4, max_value=4), H=st.integers(min_value=10, max_value=10), @@ -380,7 +443,7 @@ def inference_lstm_ref(): return ( workspace.FetchBlob("output"), workspace.FetchBlob("hidden"), - workspace.FetchBlob("cell") + workspace.FetchBlob("cell"), ) output, hidden, cell = inference_lstm_ref() @@ -526,7 +589,7 @@ def rand_rotated_roi(): np.random.rand() * H, np.random.rand() * W, np.random.rand() * H, - np.random.rand() * 360 - 180 + np.random.rand() * 360 - 180, ] ).astype(np.float32) @@ -613,18 +676,19 @@ def test_collect_and_distribute_fpn_rpn_proposals_op(self, roi_counts): for x, y in zip(fpn_outputs, all_outputs[1:]): torch.testing.assert_allclose(x, y) - @given(X=hu.tensor(), - fast_gelu=st.booleans()) + @given(X=hu.tensor(), fast_gelu=st.booleans()) def _test_gelu_op(self, X, fast_gelu, device): def _gelu_ref(_X): - return (_X * norm.cdf(_X).astype(np.float32), ) - expected_output, = _gelu_ref(X) + return (_X * norm.cdf(_X).astype(np.float32),) + + (expected_output,) = _gelu_ref(X) actual_output = torch.ops._caffe2.Gelu(torch.tensor(X), fast_gelu) rtol = 1e-3 if fast_gelu else 1e-4 atol = 1e-5 torch.testing.assert_allclose( - expected_output, actual_output.cpu(), rtol=rtol, atol=atol) + expected_output, actual_output.cpu(), rtol=rtol, atol=atol + ) def test_gelu_op(self): self._test_gelu_op(device="cpu") @@ -633,13 +697,11 @@ def test_gelu_op(self): def test_gelu_op_cuda(self): self._test_gelu_op(device="cuda") - - @given(inputs=hu.lengths_tensor( - dtype=np.float32, - min_value=1, - max_value=5, - allow_empty=True, - )) + @given( + inputs=hu.lengths_tensor( + dtype=np.float32, min_value=1, max_value=5, allow_empty=True + ) + ) def _test_lengths_op(self, inputs, ref_op_name, torch_op, device): data, lengths = inputs @@ -652,7 +714,8 @@ def _lengths_ref(X, Y): expected_output = _lengths_ref(data, lengths) actual_output = torch_op( - torch.tensor(data), torch.tensor(lengths, dtype=torch.int32)) + torch.tensor(data), torch.tensor(lengths, dtype=torch.int32) + ) torch.testing.assert_allclose(expected_output, actual_output.cpu()) @@ -691,8 +754,12 @@ def _test_resize_nearest_op(self, device): def _resize_nearest_ref(X): ref_op = core.CreateOperator( - "ResizeNearest", ["X"], ["Y"], - width_scale=2.0, height_scale=1.5, order="NCHW", + "ResizeNearest", + ["X"], + ["Y"], + 
width_scale=2.0, + height_scale=1.5, + order="NCHW", ) workspace.FeedBlob("X", X) workspace.RunOperatorOnce(ref_op) @@ -701,7 +768,9 @@ def _resize_nearest_ref(X): expected_output = _resize_nearest_ref(data) actual_output = torch.ops._caffe2.ResizeNearest( torch.tensor(data).to(device), - order="NCHW", width_scale=2.0, height_scale=1.5, + order="NCHW", + width_scale=2.0, + height_scale=1.5, ) torch.testing.assert_allclose(expected_output, actual_output.cpu()) @@ -716,9 +785,7 @@ def test_resize_nearest_op_cuda(self): @given(input_data=hu.tensor(min_dim=2, max_dim=2)) def test_Fused8BitRowwiseQuantizedToFloat(self, input_data): QuantizeOp = core.CreateOperator( - "FloatToFused8BitRowwiseQuantized", - ["input_data"], - ["quantized_data"], + "FloatToFused8BitRowwiseQuantized", ["input_data"], ["quantized_data"] ) workspace.FeedBlob("input_data", input_data) @@ -741,16 +808,15 @@ def test_piecewise_linear_op(self, binary_input): num_dims = 3 data = np.random.rand(1024, num_dims).astype(np.float32) slopes = np.zeros(4 * num_dims).astype(np.float32) - bounds = np.sort(np.random.rand(5, num_dims).astype(np.float32), axis=0).flatten('F') + bounds = np.sort( + np.random.rand(5, num_dims).astype(np.float32), axis=0 + ).flatten("F") intercepts = np.random.rand(4 * num_dims).astype(np.float32) def _piecewise_linear_ref(X): ref_op = core.CreateOperator( "PiecewiseLinearTransform", - ["data", - "bounds", - "slopes", - "intercepts"], + ["data", "bounds", "slopes", "intercepts"], ["calibrated"], binary=binary_input, ) @@ -763,7 +829,12 @@ def _piecewise_linear_ref(X): expected_output = _piecewise_linear_ref(data) actual_output = torch.ops._caffe2.PiecewiseLinearTransform( - torch.tensor(data), bounds.tolist(), slopes.tolist(), intercepts.tolist(), binary_input) + torch.tensor(data), + bounds.tolist(), + slopes.tolist(), + intercepts.tolist(), + binary_input, + ) torch.testing.assert_allclose(torch.tensor(expected_output), actual_output) @@ -790,9 +861,7 @@ def test_index_hash_op(self): data = np.random.randint(low=0, high=1000, size=(4, 4, 4)) def _index_hash_ref(X): - ref_op = core.CreateOperator( - "IndexHash", ["X"], ["Y"], seed=0, modulo=100 - ) + ref_op = core.CreateOperator("IndexHash", ["X"], ["Y"], seed=0, modulo=100) workspace.FeedBlob("X", X) workspace.RunOperatorOnce(ref_op) return workspace.FetchBlob("Y") @@ -817,33 +886,32 @@ def _bucketize_ref(X): return workspace.FetchBlob("Y") expected_output = _bucketize_ref(data) - actual_output = torch.ops._caffe2.Bucketize( - torch.tensor(data), boundaries - ) + actual_output = torch.ops._caffe2.Bucketize(torch.tensor(data), boundaries) torch.testing.assert_allclose(expected_output, actual_output.cpu()) - @given(X=hu.tensor(), - eps=st.floats(min_value=1e-4, max_value=1e-2), - ) + @given(X=hu.tensor(), eps=st.floats(min_value=1e-4, max_value=1e-2)) def test_logit(self, X, eps): def ref(X, eps): - ref_op = core.CreateOperator('Logit', ["X"], ["Y"], eps=eps) + ref_op = core.CreateOperator("Logit", ["X"], ["Y"], eps=eps) workspace.FeedBlob("X", X) workspace.RunOperatorOnce(ref_op) return workspace.FetchBlob("Y") + expected_output = ref(X, eps) - actual_output = torch.ops._caffe2.Logit( - torch.tensor(X), eps - ) + actual_output = torch.ops._caffe2.Logit(torch.tensor(X), eps) torch.testing.assert_allclose(expected_output, actual_output.cpu()) def test_percentile(self): - original_values = np.array([[3., 5., 3], [5., 1., 6.]]).astype(np.float32) - value_to_pct = np.array([[3, 0.2], [5, 0.5], [1, 0.3], [3, 0.6]]).astype(np.float32) + original_values = 
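The Logit binding covered just above is simple enough to restate. As a hedged reminder (not part of the change), the op is the eps-clamped inverse sigmoid, which the following NumPy sketch reproduces; the helper name is illustrative.

```python
import numpy as np

def logit_sketch(x, eps=1e-4):
    # clamp into (eps, 1 - eps) so the log never sees 0 or 1
    x = np.clip(x, eps, 1.0 - eps)
    return np.log(x / (1.0 - x))

print(logit_sketch(np.array([0.0, 0.5, 1.0], dtype=np.float32)))
# approximately [-9.21, 0.0, 9.21]; the endpoint values are set by eps
```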
np.array([[3.0, 5.0, 3], [5.0, 1.0, 6.0]]).astype(np.float32) + value_to_pct = np.array([[3, 0.2], [5, 0.5], [1, 0.3], [3, 0.6]]).astype( + np.float32 + ) lengths = np.array([2, 1, 1]).astype(np.int32) def _percentile_ref(original_values, value_to_pct, lengths): - ref_op = core.CreateOperator('Percentile', ["original_values", "value_to_pct", "lengths"], ["Y"]) + ref_op = core.CreateOperator( + "Percentile", ["original_values", "value_to_pct", "lengths"], ["Y"] + ) workspace.FeedBlob("original_values", original_values) workspace.FeedBlob("value_to_pct", value_to_pct) workspace.FeedBlob("lengths", lengths) @@ -852,7 +920,9 @@ def _percentile_ref(original_values, value_to_pct, lengths): expected_output = _percentile_ref(original_values, value_to_pct, lengths) actual_output = torch.ops._caffe2.Percentile( - torch.tensor(original_values), torch.Tensor(value_to_pct), torch.Tensor(lengths).int() + torch.tensor(original_values), + torch.Tensor(value_to_pct), + torch.Tensor(lengths).int(), ) torch.testing.assert_allclose(expected_output, actual_output.cpu()) @@ -862,7 +932,9 @@ def test_batch_bucket_one_hot_op(self): boundaries = np.array([0.1, 2.5, 1, 3.1, 4.5]).astype(np.float32) def _batch_bucket_one_hot_ref(data, lengths, boundaries): - ref_op = core.CreateOperator('BatchBucketOneHot', ["data", "lengths", "boundaries"], ["Y"]) + ref_op = core.CreateOperator( + "BatchBucketOneHot", ["data", "lengths", "boundaries"], ["Y"] + ) workspace.FeedBlob("data", data) workspace.FeedBlob("lengths", lengths) workspace.FeedBlob("boundaries", boundaries) @@ -875,31 +947,89 @@ def _batch_bucket_one_hot_ref(data, lengths, boundaries): ) torch.testing.assert_allclose(expected_output, actual_output.cpu()) + def test_gather_ranges_to_dense_op(self): + data = np.array([1, 2, 3, 4, 5, 6, 7, 8]) + ranges = np.array([[[2, 4]], [[0, 0]]]) + key = np.array([0, 1, 3, 2, 1, 0, 1, 0]) + lengths = np.array([4]) + min_observation = 2 + max_mismatched_ratio = 0.5 + max_empty_ratio = 1.0 + + outputs_name = ["X_{}".format(i) for i in range(len(lengths))] + ref_op = core.CreateOperator( + "GatherRangesToDense", + ["data", "ranges", "key"], + outputs_name, + lengths=lengths, + min_observation=min_observation, + max_mismatched_ratio=max_mismatched_ratio, + max_empty_ratio=max_empty_ratio, + ) + workspace.FeedBlob("data", data) + workspace.FeedBlob("ranges", ranges) + workspace.FeedBlob("key", key) + workspace.RunOperatorOnce(ref_op) + ref_outputs = [] + for output_name in outputs_name: + ref_outputs.append(workspace.FetchBlob(output_name)) + + outputs = torch.ops._caffe2.GatherRangesToDense( + torch.from_numpy(data), + torch.from_numpy(ranges), + torch.from_numpy(key), + lengths=lengths, + min_observation=min_observation, + max_mismatched_ratio=max_mismatched_ratio, + max_empty_ratio=max_empty_ratio, + ) + + self.assertEqual(len(ref_outputs), len(outputs)) + for i in range(0, len(ref_outputs)): + np.testing.assert_array_almost_equal(ref_outputs[i], outputs[i].numpy()) + @given(lengths_0=st.integers(1, 10), lengths_1=st.integers(1, 10)) @settings(deadline=1000) def test_merge_id_lists(self, lengths_0, lengths_1): def _merge_id_lists(lengths, values): ref_op = core.CreateOperator( - 'MergeIdLists', + "MergeIdLists", ["lengths_0", "values_0", "lengths_1", "values_1"], - ["merged_lengths", "merged_values"] + ["merged_lengths", "merged_values"], ) workspace.FeedBlob("lengths_0", lengths[0]) workspace.FeedBlob("values_0", values[0]) workspace.FeedBlob("lengths_1", lengths[1]) workspace.FeedBlob("values_1", values[1]) 
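`test_gather_ranges_to_dense_op` above passes the `key` blob and the observation/mismatch/empty thresholds straight through to the operator. As a rough sketch of the core gather only (ignoring the key, which additionally reorders each gathered range, and the thresholds, which govern validation), the behaviour for the test's `data`/`ranges` looks roughly like this; the helper name and the keyless simplification are assumptions.

```python
import numpy as np

def gather_ranges_to_dense_sketch(data, ranges, lengths):
    """ranges has shape (batch, num_features, 2) holding (start, length);
    feature f yields rows of length lengths[f], zero-filled when empty."""
    outputs = []
    for f, expected_len in enumerate(lengths):
        rows = []
        for b in range(ranges.shape[0]):
            start, length = ranges[b, f]
            if length == 0:
                rows.append(np.zeros(expected_len, dtype=data.dtype))
            else:
                assert length == expected_len, "mismatched range"
                rows.append(data[start:start + length])
        outputs.append(np.stack(rows))
    return outputs

data = np.array([1, 2, 3, 4, 5, 6, 7, 8])
ranges = np.array([[[2, 4]], [[0, 0]]])
(x0,) = gather_ranges_to_dense_sketch(data, ranges, lengths=[4])
# x0 -> [[3, 4, 5, 6],
#        [0, 0, 0, 0]]
```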
workspace.RunOperatorOnce(ref_op) - return workspace.FetchBlob("merged_lengths"), workspace.FetchBlob("merged_values") + return ( + workspace.FetchBlob("merged_lengths"), + workspace.FetchBlob("merged_values"), + ) - lengths = [np.array([lengths_0]).astype(np.int32), np.array([lengths_1]).astype(np.int32)] + lengths = [ + np.array([lengths_0]).astype(np.int32), + np.array([lengths_1]).astype(np.int32), + ] values = [ - np.random.choice(np.arange(0, 10), size=lengths_0, replace=False).astype(np.int32), - np.random.choice(np.arange(10, 20), size=lengths_1, replace=False).astype(np.int32) + np.random.choice(np.arange(0, 10), size=lengths_0, replace=False).astype( + np.int32 + ), + np.random.choice(np.arange(10, 20), size=lengths_1, replace=False).astype( + np.int32 + ), ] - expected_merged_lengths, expected_merged_values = _merge_id_lists(lengths, values) + expected_merged_lengths, expected_merged_values = _merge_id_lists( + lengths, values + ) output_merged_lengths, output_merged_values = torch.ops._caffe2.MergeIdLists( - [torch.tensor(lengths[0]), torch.tensor(values[0]), torch.tensor(lengths[1]), torch.tensor(values[1])] + [ + torch.tensor(lengths[0]), + torch.tensor(values[0]), + torch.tensor(lengths[1]), + torch.tensor(values[1]), + ] ) torch.testing.assert_allclose(expected_merged_lengths, output_merged_lengths) torch.testing.assert_allclose(expected_merged_values, output_merged_values) @@ -962,18 +1092,11 @@ def test_learning_rate(self): def test_pack_segments(self): s = torch.rand(3, 3, 3) lengths = torch.tensor([2, 1]) - packed_tensor, _ = torch.ops._caffe2.PackSegments( - lengths, - s, - ) + packed_tensor, _ = torch.ops._caffe2.PackSegments(lengths, s) self.assertEqual(packed_tensor.numpy().shape, (2, 2, 3, 3)) - unpacked_tensor = torch.ops._caffe2.UnpackSegments( - lengths, - packed_tensor, - ) + unpacked_tensor = torch.ops._caffe2.UnpackSegments(lengths, packed_tensor) torch.testing.assert_allclose(s, unpacked_tensor) - -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/caffe2/python/operator_test/transpose_op_test.py b/caffe2/python/operator_test/transpose_op_test.py index e4b739a741ac..4ccec250e22b 100644 --- a/caffe2/python/operator_test/transpose_op_test.py +++ b/caffe2/python/operator_test/transpose_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, workspace from hypothesis import given, settings diff --git a/caffe2/python/operator_test/trigonometric_op_test.py b/caffe2/python/operator_test/trigonometric_op_test.py index 5d57940dc33e..04b98857c301 100644 --- a/caffe2/python/operator_test/trigonometric_op_test.py +++ b/caffe2/python/operator_test/trigonometric_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/unique_ops_test.py b/caffe2/python/operator_test/unique_ops_test.py index 016554321983..b49f4765539e 100644 --- a/caffe2/python/operator_test/unique_ops_test.py +++ b/caffe2/python/operator_test/unique_ops_test.py @@ -13,10 +13,10 @@ # limitations under the License. 
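The PackSegments/UnpackSegments round trip exercised in torch_integration_test.py above can be summarized with a small NumPy sketch; this is only an illustration of the expected shapes (lengths [2, 1] packing a (3, 3, 3) input into a zero-padded (2, 2, 3, 3) output), not the operator's actual implementation:

    import numpy as np

    def pack_segments_ref(lengths, data):
        # Pack a (sum(lengths), ...) tensor into a zero-padded
        # (len(lengths), max(lengths), ...) tensor.
        max_len = int(max(lengths))
        out = np.zeros((len(lengths), max_len) + data.shape[1:], dtype=data.dtype)
        offset = 0
        for i, n in enumerate(lengths):
            out[i, :n] = data[offset:offset + n]
            offset += n
        return out

    packed = pack_segments_ref([2, 1], np.random.rand(3, 3, 3).astype(np.float32))
    assert packed.shape == (2, 2, 3, 3)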
############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given, settings import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/unique_uniform_fill_op_test.py b/caffe2/python/operator_test/unique_uniform_fill_op_test.py index f858e8fa06bd..1026745db724 100644 --- a/caffe2/python/operator_test/unique_uniform_fill_op_test.py +++ b/caffe2/python/operator_test/unique_uniform_fill_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from hypothesis import given import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/upsample_op_test.py b/caffe2/python/operator_test/upsample_op_test.py index a56d1edebe68..61b01644bcf5 100644 --- a/caffe2/python/operator_test/upsample_op_test.py +++ b/caffe2/python/operator_test/upsample_op_test.py @@ -13,9 +13,9 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/utility_ops_test.py b/caffe2/python/operator_test/utility_ops_test.py index 2814d7a02775..241d1e4c1b56 100644 --- a/caffe2/python/operator_test/utility_ops_test.py +++ b/caffe2/python/operator_test/utility_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from hypothesis import assume, given, settings diff --git a/caffe2/python/operator_test/video_input_op_test.py b/caffe2/python/operator_test/video_input_op_test.py index c06183c0f1bb..f21f219bd90e 100644 --- a/caffe2/python/operator_test/video_input_op_test.py +++ b/caffe2/python/operator_test/video_input_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import os import shutil diff --git a/caffe2/python/operator_test/weight_scale_test.py b/caffe2/python/operator_test/weight_scale_test.py index 9988ebc309d2..5cdc11eb4d11 100644 --- a/caffe2/python/operator_test/weight_scale_test.py +++ b/caffe2/python/operator_test/weight_scale_test.py @@ -13,10 +13,10 @@ # limitations under the License. 
############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/weighted_multi_sample_test.py b/caffe2/python/operator_test/weighted_multi_sample_test.py index 8b0966590594..830a9f9849c7 100644 --- a/caffe2/python/operator_test/weighted_multi_sample_test.py +++ b/caffe2/python/operator_test/weighted_multi_sample_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/weighted_sample_test.py b/caffe2/python/operator_test/weighted_sample_test.py index 24326d6337c4..032e9e9d755e 100644 --- a/caffe2/python/operator_test/weighted_sample_test.py +++ b/caffe2/python/operator_test/weighted_sample_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/weighted_sum_test.py b/caffe2/python/operator_test/weighted_sum_test.py index 4940bc69a052..2c7dffe92672 100644 --- a/caffe2/python/operator_test/weighted_sum_test.py +++ b/caffe2/python/operator_test/weighted_sum_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/wngrad_test.py b/caffe2/python/operator_test/wngrad_test.py index 2a48bed86358..48fe0f94731e 100644 --- a/caffe2/python/operator_test/wngrad_test.py +++ b/caffe2/python/operator_test/wngrad_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import functools diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index 21a61a93d00c..9a2f9f541420 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -1,6 +1,6 @@ # @package optimizer # Module caffe2.python.optimizer -from __future__ import absolute_import, division, print_function, unicode_literals + import copy import logging diff --git a/caffe2/python/optimizer_context.py b/caffe2/python/optimizer_context.py index 483f08dc5aff..d1593f440383 100644 --- a/caffe2/python/optimizer_context.py +++ b/caffe2/python/optimizer_context.py @@ -1,9 +1,9 @@ ## @package optimizer_context # Module caffe2.python.optimizer_context -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import context from caffe2.python.modifier_context import ( diff --git a/caffe2/python/optimizer_test.py b/caffe2/python/optimizer_test.py index a45571f19683..90f0932d23f6 100644 --- a/caffe2/python/optimizer_test.py +++ b/caffe2/python/optimizer_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.proto import caffe2_pb2 import caffe2.python.optimizer as optimizer from 
caffe2.python.optimizer import ( diff --git a/caffe2/python/optimizer_test_util.py b/caffe2/python/optimizer_test_util.py index f7df35bfee70..02276b08c176 100644 --- a/caffe2/python/optimizer_test_util.py +++ b/caffe2/python/optimizer_test_util.py @@ -1,9 +1,9 @@ ## @package optimizer_test_util # Module caffe2.python.optimizer_test_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/parallel_workers.py b/caffe2/python/parallel_workers.py index 224dbf66b6ce..4ee446610bdb 100644 --- a/caffe2/python/parallel_workers.py +++ b/caffe2/python/parallel_workers.py @@ -1,9 +1,9 @@ # @package parallel_workers # Module caffe2.python.parallel_workers -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + ''' diff --git a/caffe2/python/parallel_workers_test.py b/caffe2/python/parallel_workers_test.py index a3367e6ee351..a9a7c6a078d7 100644 --- a/caffe2/python/parallel_workers_test.py +++ b/caffe2/python/parallel_workers_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest diff --git a/caffe2/python/parallelize_bmuf_distributed_test.py b/caffe2/python/parallelize_bmuf_distributed_test.py index b3647a2007f5..c38a4ccc34d7 100644 --- a/caffe2/python/parallelize_bmuf_distributed_test.py +++ b/caffe2/python/parallelize_bmuf_distributed_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from multiprocessing import Process, Manager diff --git a/caffe2/python/pipeline.py b/caffe2/python/pipeline.py index 5b30da4387f3..4625d0b0458c 100644 --- a/caffe2/python/pipeline.py +++ b/caffe2/python/pipeline.py @@ -1,9 +1,9 @@ ## @package pipeline # Module caffe2.python.pipeline -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, queue_util from caffe2.python.dataio import Reader, Writer diff --git a/caffe2/python/pipeline_test.py b/caffe2/python/pipeline_test.py index 5f57355b25d3..fe00933ac4e1 100644 --- a/caffe2/python/pipeline_test.py +++ b/caffe2/python/pipeline_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.schema import ( Struct, FetchRecord, NewRecord, FeedRecord, InitEmptyRecord) diff --git a/caffe2/python/predictor/mobile_exporter.py b/caffe2/python/predictor/mobile_exporter.py index 7eea50464504..e0fa90bffb6e 100644 --- a/caffe2/python/predictor/mobile_exporter.py +++ b/caffe2/python/predictor/mobile_exporter.py @@ -1,10 +1,10 @@ ## @package mobile_exporter # Module caffe2.python.mobile_exporter -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, utils from caffe2.proto import caffe2_pb2 import numpy as np diff --git a/caffe2/python/predictor/mobile_exporter_test.py b/caffe2/python/predictor/mobile_exporter_test.py index 1c4cf77ea051..0269ec229888 100644 --- 
a/caffe2/python/predictor/mobile_exporter_test.py +++ b/caffe2/python/predictor/mobile_exporter_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.test_util import TestCase from caffe2.python import workspace, brew from caffe2.python.model_helper import ModelHelper diff --git a/caffe2/python/predictor/predictor_exporter.py b/caffe2/python/predictor/predictor_exporter.py index e9759862fcb5..c8c68f9f30a0 100644 --- a/caffe2/python/predictor/predictor_exporter.py +++ b/caffe2/python/predictor/predictor_exporter.py @@ -1,9 +1,9 @@ ## @package predictor_exporter # Module caffe2.python.predictor.predictor_exporter -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.proto import metanet_pb2 diff --git a/caffe2/python/predictor/predictor_exporter_test.py b/caffe2/python/predictor/predictor_exporter_test.py index 9c8b16c30705..2a0685fb955c 100644 --- a/caffe2/python/predictor/predictor_exporter_test.py +++ b/caffe2/python/predictor/predictor_exporter_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import tempfile import unittest diff --git a/caffe2/python/predictor/predictor_py_utils.py b/caffe2/python/predictor/predictor_py_utils.py index 1af5923952dc..cc831454a08c 100644 --- a/caffe2/python/predictor/predictor_py_utils.py +++ b/caffe2/python/predictor/predictor_py_utils.py @@ -1,9 +1,9 @@ ## @package predictor_py_utils # Module caffe2.python.predictor.predictor_py_utils -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, scope diff --git a/caffe2/python/predictor/predictor_test.py b/caffe2/python/predictor/predictor_test.py index 26c4cae63b57..64c88006686c 100644 --- a/caffe2/python/predictor/predictor_test.py +++ b/caffe2/python/predictor/predictor_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/predictor/serde.py b/caffe2/python/predictor/serde.py index af48b2920a87..2b8f1544803d 100644 --- a/caffe2/python/predictor/serde.py +++ b/caffe2/python/predictor/serde.py @@ -1,9 +1,9 @@ ## @package serde # Module caffe2.python.predictor.serde -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def serialize_protobuf_struct(protobuf_struct): diff --git a/caffe2/python/predictor_constants.py b/caffe2/python/predictor_constants.py index c1e1dedb8b09..eda0c66974f4 100644 --- a/caffe2/python/predictor_constants.py +++ b/caffe2/python/predictor_constants.py @@ -1,9 +1,9 @@ ## @package predictor_constants # Module caffe2.python.predictor_constants -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import caffe2.proto.predictor_consts_pb2 as predictor_consts predictor_constants = predictor_consts.PredictorConsts() diff 
--git a/caffe2/python/python_op_test.py b/caffe2/python/python_op_test.py index 5a8cfe4a9b46..893671b96f45 100644 --- a/caffe2/python/python_op_test.py +++ b/caffe2/python/python_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.core import CreatePythonOperator import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/queue_util.py b/caffe2/python/queue_util.py index 62265758c2f2..c9a91fc27d17 100644 --- a/caffe2/python/queue_util.py +++ b/caffe2/python/queue_util.py @@ -1,9 +1,9 @@ ## @package queue_util # Module caffe2.python.queue_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, dataio from caffe2.python.task import TaskGroup diff --git a/caffe2/python/record_queue.py b/caffe2/python/record_queue.py index d5f129a2f902..1170c2bf3a82 100644 --- a/caffe2/python/record_queue.py +++ b/caffe2/python/record_queue.py @@ -3,10 +3,10 @@ """ Implementation of a queue wrapper. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.dataio import Reader, Writer diff --git a/caffe2/python/recurrent.py b/caffe2/python/recurrent.py index e5b48894efbc..d4762f08c683 100644 --- a/caffe2/python/recurrent.py +++ b/caffe2/python/recurrent.py @@ -1,9 +1,9 @@ ## @package recurrent # Module caffe2.python.recurrent -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from future.utils import viewitems, viewkeys diff --git a/caffe2/python/regularizer.py b/caffe2/python/regularizer.py index e994de8b0c44..4042149ca80c 100644 --- a/caffe2/python/regularizer.py +++ b/caffe2/python/regularizer.py @@ -1,6 +1,6 @@ # @package optimizer # Module caffe2.python.regularizer -from __future__ import absolute_import, division, print_function, unicode_literals + from caffe2.python import core, utils import numpy as np diff --git a/caffe2/python/regularizer_context.py b/caffe2/python/regularizer_context.py index 6935fdcb47c0..5d79e138b6b7 100644 --- a/caffe2/python/regularizer_context.py +++ b/caffe2/python/regularizer_context.py @@ -1,9 +1,9 @@ # @package regularizer_context # Module caffe2.python.regularizer_context -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import context from caffe2.python.modifier_context import ( diff --git a/caffe2/python/regularizer_test.py b/caffe2/python/regularizer_test.py index 2018040433b4..685feaf93ed2 100644 --- a/caffe2/python/regularizer_test.py +++ b/caffe2/python/regularizer_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/python/rnn/__init__.py b/caffe2/python/rnn/__init__.py index a37eb20fda26..3f2ff2d6cc8f 100644 --- a/caffe2/python/rnn/__init__.py +++ b/caffe2/python/rnn/__init__.py @@ -1,5 +1,5 @@ -from __future__ import absolute_import -from 
__future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + diff --git a/caffe2/python/rnn/lstm_comparison.py b/caffe2/python/rnn/lstm_comparison.py index c3bf9b30cea7..dee96413dbe5 100644 --- a/caffe2/python/rnn/lstm_comparison.py +++ b/caffe2/python/rnn/lstm_comparison.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import workspace, core, lstm_benchmark, utils from copy import copy diff --git a/caffe2/python/rnn/rnn_cell_test_util.py b/caffe2/python/rnn/rnn_cell_test_util.py index 1533c1e3d418..95728d682bfa 100644 --- a/caffe2/python/rnn/rnn_cell_test_util.py +++ b/caffe2/python/rnn/rnn_cell_test_util.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace, scope from caffe2.python.model_helper import ModelHelper diff --git a/caffe2/python/rnn_cell.py b/caffe2/python/rnn_cell.py index 8192b34dc12e..e16bfaaf491e 100644 --- a/caffe2/python/rnn_cell.py +++ b/caffe2/python/rnn_cell.py @@ -1,9 +1,9 @@ ## @package rnn_cell # Module caffe2.python.rnn_cell -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import functools import inspect diff --git a/caffe2/python/schema.py b/caffe2/python/schema.py index 50fe136a5a12..fb7cadf42847 100644 --- a/caffe2/python/schema.py +++ b/caffe2/python/schema.py @@ -13,10 +13,10 @@ walkthrough on how to use schema to store and iterate through a structured in-memory dataset. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import logging import numpy as np diff --git a/caffe2/python/schema_test.py b/caffe2/python/schema_test.py index 28bf5c64a428..dca19a127ef2 100644 --- a/caffe2/python/schema_test.py +++ b/caffe2/python/schema_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema import numpy as np diff --git a/caffe2/python/scope.py b/caffe2/python/scope.py index be05aa468d10..11fddc7b0f62 100644 --- a/caffe2/python/scope.py +++ b/caffe2/python/scope.py @@ -1,9 +1,9 @@ ## @package scope # Module caffe2.python.scope -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import contextlib import threading diff --git a/caffe2/python/scope_test.py b/caffe2/python/scope_test.py index b24fc6851428..9bd69eb32902 100644 --- a/caffe2/python/scope_test.py +++ b/caffe2/python/scope_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import scope, core, workspace from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/serialized_test/coverage.py b/caffe2/python/serialized_test/coverage.py index 7ba93f66af6b..2014847242c4 100644 --- a/caffe2/python/serialized_test/coverage.py +++ b/caffe2/python/serialized_test/coverage.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/serialized_test/serialized_test_util.py b/caffe2/python/serialized_test/serialized_test_util.py index 30810d9d8283..621adca9454e 100644 --- a/caffe2/python/serialized_test/serialized_test_util.py +++ b/caffe2/python/serialized_test/serialized_test_util.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/session.py b/caffe2/python/session.py index 9059e1eabc94..de3b09931a30 100644 --- a/caffe2/python/session.py +++ b/caffe2/python/session.py @@ -1,9 +1,9 @@ ## @package session # Module caffe2.python.session -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace diff --git a/caffe2/python/session_test.py b/caffe2/python/session_test.py index ae5e50d23ec7..fa505c296820 100644 --- a/caffe2/python/session_test.py +++ b/caffe2/python/session_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.schema import ( Struct, FetchRecord, NewRecord, FeedRecord, InitEmptyRecord) diff --git a/caffe2/python/sparse_to_dense_mask_test.py b/caffe2/python/sparse_to_dense_mask_test.py index 375068ef537e..e62c7e6d41dc 100644 --- 
a/caffe2/python/sparse_to_dense_mask_test.py +++ b/caffe2/python/sparse_to_dense_mask_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/sparse_to_dense_test.py b/caffe2/python/sparse_to_dense_test.py index 5e6d10823e5f..dc43d2c03394 100644 --- a/caffe2/python/sparse_to_dense_test.py +++ b/caffe2/python/sparse_to_dense_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/task.py b/caffe2/python/task.py index 9dcb211274b3..853433d5c38e 100644 --- a/caffe2/python/task.py +++ b/caffe2/python/task.py @@ -1,9 +1,5 @@ ## @package task # Module caffe2.python.task -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals from caffe2.python import core, context from caffe2.python.schema import Field, from_blob_list @@ -354,7 +350,9 @@ def workspace_type(self): def __repr__(self): return "TaskGroup(tasks={}, workspace_type={}, remote_nets={})".format( - self.tasks(), self.workspace_type(), self.remote_nets()) + self._tasks + self._tasks_to_add, + self.workspace_type(), + self.remote_nets()) class TaskOutput(object): diff --git a/caffe2/python/task_test.py b/caffe2/python/task_test.py index f1c51bc5b442..31adb41a0ac9 100644 --- a/caffe2/python/task_test.py +++ b/caffe2/python/task_test.py @@ -1,8 +1,3 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - import unittest from caffe2.python import task @@ -22,3 +17,8 @@ def testRepr(self): ] for obj, want in cases: self.assertEqual(obj.__repr__(), want) + + def testEffectlessRepr(self): + task_group = task.TaskGroup() + _repr = task_group.__repr__() + self.assertFalse(task_group._already_used) diff --git a/caffe2/python/test/blob_deallocation_test.py b/caffe2/python/test/blob_deallocation_test.py index 66d6835c4814..37886618ef45 100644 --- a/caffe2/python/test/blob_deallocation_test.py +++ b/caffe2/python/test/blob_deallocation_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, workspace import unittest diff --git a/caffe2/python/test/do_op_test.py b/caffe2/python/test/do_op_test.py index 72e9f83c9540..fcc6918d5350 100644 --- a/caffe2/python/test/do_op_test.py +++ b/caffe2/python/test/do_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/test/executor_test.py b/caffe2/python/test/executor_test.py index 84df86fb05b0..b4db64005f62 100644 --- a/caffe2/python/test/executor_test.py +++ b/caffe2/python/test/executor_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, workspace from caffe2.python.test.executor_test_util import ( diff 
--git a/caffe2/python/test/executor_test_util.py b/caffe2/python/test/executor_test_util.py index bf93c49d8cdc..ba10247eaa2e 100644 --- a/caffe2/python/test/executor_test_util.py +++ b/caffe2/python/test/executor_test_util.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import ( diff --git a/caffe2/python/test/fakefp16_transform_test.py b/caffe2/python/test/fakefp16_transform_test.py index d58d12ad60de..f98342eba54a 100644 --- a/caffe2/python/test/fakefp16_transform_test.py +++ b/caffe2/python/test/fakefp16_transform_test.py @@ -1,6 +1,6 @@ -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + import unittest from caffe2.python.fakefp16_transform_lib import fakeFp16FuseOps diff --git a/caffe2/python/test/gpu_context_test.py b/caffe2/python/test/gpu_context_test.py index 741f39d6dc8a..9ee8a308cc2e 100644 --- a/caffe2/python/test/gpu_context_test.py +++ b/caffe2/python/test/gpu_context_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest diff --git a/caffe2/python/test/python_protobuf_test.py b/caffe2/python/test/python_protobuf_test.py index 817f5e21a563..7790e0f6d8f5 100644 --- a/caffe2/python/test/python_protobuf_test.py +++ b/caffe2/python/test/python_protobuf_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + # make sure we use cpp implementation of protobuf import os diff --git a/caffe2/python/test_util.py b/caffe2/python/test_util.py index a2cf3aced07c..94ac41524065 100644 --- a/caffe2/python/test_util.py +++ b/caffe2/python/test_util.py @@ -1,9 +1,9 @@ ## @package test_util # Module caffe2.python.test_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from caffe2.python import core, workspace diff --git a/caffe2/python/text_file_reader.py b/caffe2/python/text_file_reader.py index 52a1b274f086..48f69f90c7b4 100644 --- a/caffe2/python/text_file_reader.py +++ b/caffe2/python/text_file_reader.py @@ -1,9 +1,9 @@ ## @package text_file_reader # Module caffe2.python.text_file_reader -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.dataio import Reader from caffe2.python.schema import Scalar, Struct, data_type_for_dtype diff --git a/caffe2/python/timeout_guard.py b/caffe2/python/timeout_guard.py index 07226c128ffe..2314a3ad9c24 100644 --- a/caffe2/python/timeout_guard.py +++ b/caffe2/python/timeout_guard.py @@ -1,9 +1,9 @@ ## @package timeout_guard # Module caffe2.python.timeout_guard -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import contextlib import threading diff --git a/caffe2/python/transformations.py b/caffe2/python/transformations.py index ed0a32788de8..fc1bad34b201 100644 --- a/caffe2/python/transformations.py +++ b/caffe2/python/transformations.py @@ -13,10 +13,10 @@ # limitations under the License. 
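The TaskGroup.__repr__ change above (guarded by the new testEffectlessRepr) is about keeping repr side-effect free; a minimal sketch of the idea, using hypothetical simplified names rather than the real task.py internals, is:

    class TaskGroupSketch(object):
        def __init__(self):
            self._tasks = []
            self._tasks_to_add = []
            self._already_used = False

        def tasks(self):
            # The real tasks() finalizes the group, which is why calling it
            # from __repr__ had an unwanted side effect.
            self._already_used = True
            return self._tasks + self._tasks_to_add

        def __repr__(self):
            # Side-effect free: read the private lists directly.
            return "TaskGroup(tasks={})".format(self._tasks + self._tasks_to_add)

    g = TaskGroupSketch()
    repr(g)
    assert not g._already_used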
############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import caffe2.python._import_c_extension as C diff --git a/caffe2/python/transformations_test.py b/caffe2/python/transformations_test.py index 363ceb19619d..14b97e4939ef 100644 --- a/caffe2/python/transformations_test.py +++ b/caffe2/python/transformations_test.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given import hypothesis.strategies as st diff --git a/caffe2/python/trt/test_trt.py b/caffe2/python/trt/test_trt.py index e95cb4bd46e3..39d37ca9fa0a 100644 --- a/caffe2/python/trt/test_trt.py +++ b/caffe2/python/trt/test_trt.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/trt/transform.py b/caffe2/python/trt/transform.py index ce45ae3cb86d..0936941aac03 100644 --- a/caffe2/python/trt/transform.py +++ b/caffe2/python/trt/transform.py @@ -6,10 +6,10 @@ Note that ONNX-TRT enforce an NCHW input! """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python.onnx.helper import c2_native_run_net, c2_native_run_op diff --git a/caffe2/python/tt_core.py b/caffe2/python/tt_core.py index a2011da16b15..314718b76c9d 100644 --- a/caffe2/python/tt_core.py +++ b/caffe2/python/tt_core.py @@ -1,8 +1,8 @@ ## @package tt_core # Module caffe2.python.tt_core -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np diff --git a/caffe2/python/tt_core_test.py b/caffe2/python/tt_core_test.py index aec5764e66e5..0cee3b254720 100644 --- a/caffe2/python/tt_core_test.py +++ b/caffe2/python/tt_core_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/python/utils.py b/caffe2/python/utils.py index 9cf30d9c06b3..947dd9bf296d 100644 --- a/caffe2/python/utils.py +++ b/caffe2/python/utils.py @@ -1,9 +1,9 @@ # @package utils # Module caffe2.python.utils -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python.compatibility import container_abcs diff --git a/caffe2/python/utils_test.py b/caffe2/python/utils_test.py index 3921f3d67ca7..ef809bfd8154 100644 --- a/caffe2/python/utils_test.py +++ b/caffe2/python/utils_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, utils, test_util diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py index 
f76fcf75a33a..99983e84f097 100644 --- a/caffe2/python/workspace.py +++ b/caffe2/python/workspace.py @@ -1,9 +1,9 @@ ## @package workspace # Module caffe2.python.workspace -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import collections import contextlib from google.protobuf.message import Message diff --git a/caffe2/python/workspace_test.py b/caffe2/python/workspace_test.py index 7e64220f480e..86dbcf5d70ba 100644 --- a/caffe2/python/workspace_test.py +++ b/caffe2/python/workspace_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import os diff --git a/caffe2/quantization/server/batch_matmul_dnnlowp_op_test.py b/caffe2/quantization/server/batch_matmul_dnnlowp_op_test.py index 08f658ba9608..4f4bad64980c 100644 --- a/caffe2/quantization/server/batch_matmul_dnnlowp_op_test.py +++ b/caffe2/quantization/server/batch_matmul_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections from itertools import product diff --git a/caffe2/quantization/server/batch_permutation_dnnlowp_op_test.py b/caffe2/quantization/server/batch_permutation_dnnlowp_op_test.py index 27a07ece62be..1d3fd2cc369d 100644 --- a/caffe2/quantization/server/batch_permutation_dnnlowp_op_test.py +++ b/caffe2/quantization/server/batch_permutation_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/quantization/server/channel_shuffle_dnnlowp_op_test.py b/caffe2/quantization/server/channel_shuffle_dnnlowp_op_test.py index 82dd1772d5da..24a2269cc850 100644 --- a/caffe2/quantization/server/channel_shuffle_dnnlowp_op_test.py +++ b/caffe2/quantization/server/channel_shuffle_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/quantization/server/compute_equalization_scale.cc b/caffe2/quantization/server/compute_equalization_scale.cc new file mode 100644 index 000000000000..6e2f73ebd840 --- /dev/null +++ b/caffe2/quantization/server/compute_equalization_scale.cc @@ -0,0 +1,96 @@ +// Copyright 2004-present Facebook. All Rights Reserved. 
+#include "caffe2/quantization/server/compute_equalization_scale.h" +#include + +namespace caffe2 { +using namespace std; + +bool ComputeEqualizationScaleOp::RunOnDevice() { + // Generate equalization scale based on the input data (last N samples of + // the activations) and the weight + const auto& X = Input(0); + const auto& W = Input(1); + CAFFE_ENFORCE_EQ(X.dim(), 2); + CAFFE_ENFORCE_EQ(W.dim(), 2); + + const int64_t M = X.size_to_dim(1); + const int64_t N = W.size_to_dim(1); + const int64_t K = W.size_from_dim(1); + auto* S = Output(0, K, at::dtype()); + auto* S_INV = Output(1, K, at::dtype()); + const float* X_data = X.template data(); + const float* W_data = W.template data(); + float* S_data = S->template mutable_data(); + float* S_INV_data = S_INV->template mutable_data(); + + float WcolMax, XcolMax; + for (int64_t j = 0; j < K; j++) { + WcolMax = std::abs(W_data[j]); + XcolMax = std::abs(X_data[j]); + int64_t idx; + for (int64_t i = 0; i < N; i++) { + idx = i * K + j; + WcolMax = std::max(WcolMax, std::abs(W_data[idx])); + } + for (int64_t i = 0; i < M; i++) { + idx = i * K + j; + XcolMax = std::max(XcolMax, std::abs(X_data[idx])); + } + if (WcolMax == 0 || XcolMax == 0) { + S_data[j] = 1; + S_INV_data[j] = 1; + } else { + S_data[j] = std::sqrt(WcolMax / XcolMax); + S_INV_data[j] = 1 / S_data[j]; + } + } + return true; +} + +REGISTER_CPU_OPERATOR(ComputeEqualizationScale, ComputeEqualizationScaleOp); +OPERATOR_SCHEMA(ComputeEqualizationScale) + .NumInputs(2) + .NumOutputs(2) + .SetDoc(R"DOC( +Given a weight matrix W and input matrix X, the output S is the equalization parameter +vector computed from W and X, and S_INV = 1 / S + +S is computed by: +S[j] = max(abs(W[][j])) == 0 || max(abs(X[][j])) == 0 ? 1 : + sqrt(max(abs(W[][j])) / max(abs(X[][j]))), + +)DOC") + .TensorInferenceFunction([](const OperatorDef& /* def */, + const vector& in) { + vector out(2); + + if (in[0].unknown_shape() || in[1].unknown_shape()) { + out[0].set_unknown_shape(true); + out[1].set_unknown_shape(true); + return out; + } + const int64_t K = size_from_dim_(1, GetDimsVector(in[1])); + vector s_shape(2); + s_shape[0] = 1; + s_shape[1] = K; + out[0] = CreateTensorShape(s_shape, TensorProto_DataType_FLOAT); + out[1] = CreateTensorShape(s_shape, TensorProto_DataType_FLOAT); + return out; + }) + .Input( + 0, + "X", + "The input data, or last N samples of the output activations.") + .Input(1, "W", "The weight that we want to equalize with the input.") + .Output( + 0, + "S", + "Scale computed that will be multiplied to the columns of input.") + .Output( + 1, + "S_INV", + "Scale inverse that will be multiplied to the columns of weight.") + .SetDoc( + R"DOC(Operator to compute equalization scale given the input data and weight)DOC"); + +} // namespace caffe2 diff --git a/caffe2/quantization/server/compute_equalization_scale.h b/caffe2/quantization/server/compute_equalization_scale.h new file mode 100644 index 000000000000..a9facf8e1206 --- /dev/null +++ b/caffe2/quantization/server/compute_equalization_scale.h @@ -0,0 +1,18 @@ +// Copyright 2004-present Facebook. All Rights Reserved. 
+ +#pragma once +#include "caffe2/quantization/server/caffe2_dnnlowp_utils.h" +#include "caffe2/quantization/server/dnnlowp.h" + +namespace caffe2 { + +class ComputeEqualizationScaleOp final : public Operator { + public: + ComputeEqualizationScaleOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + + bool RunOnDevice() override; + +}; // class ComputeEqualizationScaleOp + +} // namespace caffe2 diff --git a/caffe2/quantization/server/compute_equalization_scale_test.py b/caffe2/quantization/server/compute_equalization_scale_test.py new file mode 100644 index 000000000000..74d34c5502d3 --- /dev/null +++ b/caffe2/quantization/server/compute_equalization_scale_test.py @@ -0,0 +1,89 @@ +# Copyright (c) 2016-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +from __future__ import absolute_import, division, print_function, unicode_literals + +import caffe2.python.hypothesis_test_util as hu +import hypothesis.strategies as st +import numpy as np +from caffe2.python import core, workspace +from hypothesis import given, settings + + +class TestComputeEqualizationScaleOp(hu.HypothesisTestCase): + @settings(max_examples=10) + @given( + m=st.integers(1, 50), + n=st.integers(1, 50), + k=st.integers(1, 50), + rnd_seed=st.integers(1, 5), + **hu.gcs_cpu_only + ) + def test_compute_equalization_scale(self, m, n, k, rnd_seed, gc, dc): + np.random.seed(rnd_seed) + W = np.random.rand(n, k).astype(np.float32) - 0.5 + X = np.random.rand(m, k).astype(np.float32) - 0.5 + + def ref_compute_equalization_scale(X, W): + S = np.ones([X.shape[1]]) + S_INV = np.ones([X.shape[1]]) + for j in range(W.shape[1]): + WcolMax = np.absolute(W[:, j]).max() + XcolMax = np.absolute(X[:, j]).max() + if WcolMax and XcolMax: + S[j] = np.sqrt(WcolMax / XcolMax) + S_INV[j] = 1 / S[j] + return S, S_INV + + net = core.Net("test") + + ComputeEqualizationScaleOp = core.CreateOperator( + "ComputeEqualizationScale", ["X", "W"], ["S", "S_INV"] + ) + net.Proto().op.extend([ComputeEqualizationScaleOp]) + + self.ws.create_blob("X").feed(X, device_option=gc) + self.ws.create_blob("W").feed(W, device_option=gc) + self.ws.run(net) + + S = self.ws.blobs["S"].fetch() + S_INV = self.ws.blobs["S_INV"].fetch() + S_ref, S_INV_ref = ref_compute_equalization_scale(X, W) + np.testing.assert_allclose(S, S_ref, atol=1e-3, rtol=1e-3) + np.testing.assert_allclose(S_INV, S_INV_ref, atol=1e-3, rtol=1e-3) + + def test_compute_equalization_scale_shape_inference(self): + X = np.array([[1, 2], [2, 4], [6, 7]]).astype(np.float32) + W = np.array([[2, 3], [5, 4], [8, 2]]).astype(np.float32) + ComputeEqualizationScaleOp = core.CreateOperator( + "ComputeEqualizationScale", ["X", "W"], ["S", "S_INV"] + ) + workspace.FeedBlob("X", X) + workspace.FeedBlob("W", W) + + net = core.Net("test_shape_inference") + net.Proto().op.extend([ComputeEqualizationScaleOp]) + shapes, types = workspace.InferShapesAndTypes( + [net], + blob_dimensions={"X": 
X.shape, "W": W.shape}, + blob_types={"X": core.DataType.FLOAT, "W": core.DataType.FLOAT}, + ) + assert ( + "S" in shapes and "S" in types and "S_INV" in shapes and "S_INV" in types + ), "Failed to infer the shape or type of output" + self.assertEqual(shapes["S"], [1, 2]) + self.assertEqual(shapes["S_INV"], [1, 2]) + self.assertEqual(types["S"], core.DataType.FLOAT) + self.assertEqual(types["S_INV"], core.DataType.FLOAT) diff --git a/caffe2/quantization/server/concat_dnnlowp_op_test.py b/caffe2/quantization/server/concat_dnnlowp_op_test.py index 777c523aff87..fc7e897993d4 100644 --- a/caffe2/quantization/server/concat_dnnlowp_op_test.py +++ b/caffe2/quantization/server/concat_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/conv_depthwise_dnnlowp_op_test.py b/caffe2/quantization/server/conv_depthwise_dnnlowp_op_test.py index 70bcf53f44d4..a605ea3fc49e 100644 --- a/caffe2/quantization/server/conv_depthwise_dnnlowp_op_test.py +++ b/caffe2/quantization/server/conv_depthwise_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/conv_dnnlowp_acc16_op_test.py b/caffe2/quantization/server/conv_dnnlowp_acc16_op_test.py index ae2f49cfe20c..68c14b69f058 100644 --- a/caffe2/quantization/server/conv_dnnlowp_acc16_op_test.py +++ b/caffe2/quantization/server/conv_dnnlowp_acc16_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/conv_dnnlowp_op_test.py b/caffe2/quantization/server/conv_dnnlowp_op_test.py index 682a4d787aba..11cd12a4d5bc 100644 --- a/caffe2/quantization/server/conv_dnnlowp_op_test.py +++ b/caffe2/quantization/server/conv_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/conv_groupwise_dnnlowp_acc16_op_test.py b/caffe2/quantization/server/conv_groupwise_dnnlowp_acc16_op_test.py index 9ed9106db0be..715b6f8c01a8 100644 --- a/caffe2/quantization/server/conv_groupwise_dnnlowp_acc16_op_test.py +++ b/caffe2/quantization/server/conv_groupwise_dnnlowp_acc16_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/conv_groupwise_dnnlowp_op_test.py b/caffe2/quantization/server/conv_groupwise_dnnlowp_op_test.py index 773253743c6d..99e914c294b9 100644 --- a/caffe2/quantization/server/conv_groupwise_dnnlowp_op_test.py +++ b/caffe2/quantization/server/conv_groupwise_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/dequantize_dnnlowp_op_test.py b/caffe2/quantization/server/dequantize_dnnlowp_op_test.py index 399ae4363831..5694a553e744 100644 --- a/caffe2/quantization/server/dequantize_dnnlowp_op_test.py +++ b/caffe2/quantization/server/dequantize_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/dnnlowp_test_utils.py b/caffe2/quantization/server/dnnlowp_test_utils.py index 1a41664cb2d1..0d56ea6ac127 100644 --- 
a/caffe2/quantization/server/dnnlowp_test_utils.py +++ b/caffe2/quantization/server/dnnlowp_test_utils.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/elementwise_add_dnnlowp_op_test.py b/caffe2/quantization/server/elementwise_add_dnnlowp_op_test.py index 1cf65f37858a..75bd2f8e4d44 100644 --- a/caffe2/quantization/server/elementwise_add_dnnlowp_op_test.py +++ b/caffe2/quantization/server/elementwise_add_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/elementwise_linear_dnnlowp_op_test.py b/caffe2/quantization/server/elementwise_linear_dnnlowp_op_test.py index 3f199f981331..af1cd0f80684 100644 --- a/caffe2/quantization/server/elementwise_linear_dnnlowp_op_test.py +++ b/caffe2/quantization/server/elementwise_linear_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/elementwise_mul_dnnlowp_op_test.py b/caffe2/quantization/server/elementwise_mul_dnnlowp_op_test.py index b9104f598d08..e31b9d179071 100644 --- a/caffe2/quantization/server/elementwise_mul_dnnlowp_op_test.py +++ b/caffe2/quantization/server/elementwise_mul_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/elementwise_sum_dnnlowp_op_test.py b/caffe2/quantization/server/elementwise_sum_dnnlowp_op_test.py index 9b3caf41ecc5..faf526b8c48d 100644 --- a/caffe2/quantization/server/elementwise_sum_dnnlowp_op_test.py +++ b/caffe2/quantization/server/elementwise_sum_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/elementwise_sum_relu_op.cc b/caffe2/quantization/server/elementwise_sum_relu_op.cc index df4b726c7306..dbb14c0c5ce8 100644 --- a/caffe2/quantization/server/elementwise_sum_relu_op.cc +++ b/caffe2/quantization/server/elementwise_sum_relu_op.cc @@ -42,11 +42,13 @@ class SumReluOp : public SumOp { bool RunOnDevice() override { if (Input(0).template IsType()) { return DoRunWithType(); + } else if (Input(0).template IsType()) { + return DoRunWithType(); } else if (Input(0).template IsType()) { return DoRunWithType(); } else { CAFFE_THROW( - "Sum operator only supports 32-bit float and ints, but", + "Sum operator only supports 32-bit float, 64-bit double and ints, but", " input was of type ", Input(0).dtype().name()); } diff --git a/caffe2/quantization/server/fully_connected_dnnlowp_acc16_op_test.py b/caffe2/quantization/server/fully_connected_dnnlowp_acc16_op_test.py index 68059421cfac..5d77eceb8e04 100644 --- a/caffe2/quantization/server/fully_connected_dnnlowp_acc16_op_test.py +++ b/caffe2/quantization/server/fully_connected_dnnlowp_acc16_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/fully_connected_dnnlowp_op_test.py b/caffe2/quantization/server/fully_connected_dnnlowp_op_test.py index b8c4a3e22812..f1939e198b84 100644 --- a/caffe2/quantization/server/fully_connected_dnnlowp_op_test.py +++ b/caffe2/quantization/server/fully_connected_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ 
import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/fully_connected_fp16_test.py b/caffe2/quantization/server/fully_connected_fp16_test.py index 710207f7caeb..be1e2c8a1ab5 100644 --- a/caffe2/quantization/server/fully_connected_fp16_test.py +++ b/caffe2/quantization/server/fully_connected_fp16_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/fully_connected_rowwise_dnnlowp_op_test.py b/caffe2/quantization/server/fully_connected_rowwise_dnnlowp_op_test.py index a4ba681867ff..284ae56d743e 100644 --- a/caffe2/quantization/server/fully_connected_rowwise_dnnlowp_op_test.py +++ b/caffe2/quantization/server/fully_connected_rowwise_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/gather_dnnlowp_op_test.py b/caffe2/quantization/server/gather_dnnlowp_op_test.py index c1f495260722..c2c7f35a66d4 100644 --- a/caffe2/quantization/server/gather_dnnlowp_op_test.py +++ b/caffe2/quantization/server/gather_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/group_norm_dnnlowp_op_test.py b/caffe2/quantization/server/group_norm_dnnlowp_op_test.py index 93a4163c86bb..30051d95b59c 100644 --- a/caffe2/quantization/server/group_norm_dnnlowp_op_test.py +++ b/caffe2/quantization/server/group_norm_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/int8_gen_quant_params_test.py b/caffe2/quantization/server/int8_gen_quant_params_test.py index f2c7fd81dabb..d208d6f9b575 100644 --- a/caffe2/quantization/server/int8_gen_quant_params_test.py +++ b/caffe2/quantization/server/int8_gen_quant_params_test.py @@ -13,7 +13,7 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/quantization/server/int8_quant_scheme_blob_fill_test.py b/caffe2/quantization/server/int8_quant_scheme_blob_fill_test.py index f34081aeba24..70f9b0c2f1fa 100644 --- a/caffe2/quantization/server/int8_quant_scheme_blob_fill_test.py +++ b/caffe2/quantization/server/int8_quant_scheme_blob_fill_test.py @@ -13,7 +13,7 @@ # limitations under the License. 
############################################################################## -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu from caffe2.python import core, workspace diff --git a/caffe2/quantization/server/lstm_unit_dnnlowp_op_test.py b/caffe2/quantization/server/lstm_unit_dnnlowp_op_test.py index 9cd22bd2c491..bcf06ce0274e 100644 --- a/caffe2/quantization/server/lstm_unit_dnnlowp_op_test.py +++ b/caffe2/quantization/server/lstm_unit_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/observer_test.py b/caffe2/quantization/server/observer_test.py index 4299c146b2da..5c2b28e5e6fb 100644 --- a/caffe2/quantization/server/observer_test.py +++ b/caffe2/quantization/server/observer_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import numpy as np from caffe2.python import core, workspace diff --git a/caffe2/quantization/server/pool_dnnlowp_op_test.py b/caffe2/quantization/server/pool_dnnlowp_op_test.py index d581fbef00cd..fedc87ee732a 100644 --- a/caffe2/quantization/server/pool_dnnlowp_op_test.py +++ b/caffe2/quantization/server/pool_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/quantize_dnnlowp_op_test.py b/caffe2/quantization/server/quantize_dnnlowp_op_test.py index caaf456fb84e..e61a28b4b930 100644 --- a/caffe2/quantization/server/quantize_dnnlowp_op_test.py +++ b/caffe2/quantization/server/quantize_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/quantization/server/relu_dnnlowp_op_test.py b/caffe2/quantization/server/relu_dnnlowp_op_test.py index 5e85b4e43ed6..68b5aed049f1 100644 --- a/caffe2/quantization/server/relu_dnnlowp_op_test.py +++ b/caffe2/quantization/server/relu_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/resize_nearest_3d_dnnlowp_op_test.py b/caffe2/quantization/server/resize_nearest_3d_dnnlowp_op_test.py index 47ae47b81106..67017ee0afcc 100644 --- a/caffe2/quantization/server/resize_nearest_3d_dnnlowp_op_test.py +++ b/caffe2/quantization/server/resize_nearest_3d_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/quantization/server/resize_nearest_dnnlowp_op_test.py b/caffe2/quantization/server/resize_nearest_dnnlowp_op_test.py index 6af92a5d2fe5..b12b3908aafa 100644 --- a/caffe2/quantization/server/resize_nearest_dnnlowp_op_test.py +++ b/caffe2/quantization/server/resize_nearest_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/quantization/server/sigmoid_dnnlowp_op_test.py b/caffe2/quantization/server/sigmoid_dnnlowp_op_test.py index 28ff4a0a750b..836745dcf543 100644 --- 
a/caffe2/quantization/server/sigmoid_dnnlowp_op_test.py +++ b/caffe2/quantization/server/sigmoid_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/spatial_batch_norm_dnnlowp_op_test.py b/caffe2/quantization/server/spatial_batch_norm_dnnlowp_op_test.py index b1d34c19d3ae..d7253b1675f4 100644 --- a/caffe2/quantization/server/spatial_batch_norm_dnnlowp_op_test.py +++ b/caffe2/quantization/server/spatial_batch_norm_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/tanh_dnnlowp_op_test.py b/caffe2/quantization/server/tanh_dnnlowp_op_test.py index e0af7af62bba..f73befd25e26 100644 --- a/caffe2/quantization/server/tanh_dnnlowp_op_test.py +++ b/caffe2/quantization/server/tanh_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/utils.py b/caffe2/quantization/server/utils.py index 862ed5a9cd62..9e137cb5f6af 100644 --- a/caffe2/quantization/server/utils.py +++ b/caffe2/quantization/server/utils.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import copy import logging diff --git a/caffe2/queue/queue_ops.h b/caffe2/queue/queue_ops.h index 64ab19937929..bb70e0f85885 100644 --- a/caffe2/queue/queue_ops.h +++ b/caffe2/queue/queue_ops.h @@ -113,6 +113,12 @@ class SafeEnqueueBlobsOp final : public Operator { 1, !status, Output(size)->template mutable_data(), &context_); return true; } + + void Cancel() override { + auto queue = Operator::Inputs()[0] + ->template Get>(); + queue->close(); + } }; template @@ -192,6 +198,12 @@ class SafeDequeueBlobsOp final : public Operator { return true; } + void Cancel() override { + auto queue = Operator::Inputs()[0] + ->template Get>(); + queue->close(); + } + private: int numRecords_; std::vector blobs_; diff --git a/caffe2/sgd/adagrad_fused_op_gpu.cu b/caffe2/sgd/adagrad_fused_op_gpu.cu index 814a24c74183..2347f0cd8bc8 100644 --- a/caffe2/sgd/adagrad_fused_op_gpu.cu +++ b/caffe2/sgd/adagrad_fused_op_gpu.cu @@ -308,69 +308,132 @@ __global__ void rowwise_sparse_adagrad_fused_length_sum_gradient_dedup_kernel( const float LR = lr[0]; // num_indices blocks, each block process one index - int sorted_linear_indice_id = blockIdx.x; // the index of sorted_linear_ind + int sorted_linear_indice_id; + if (ExactBlock) { + sorted_linear_indice_id = + blockIdx.x * blockDim.y + threadIdx.y; // the index of sorted_linear_ind + } else { + sorted_linear_indice_id = blockIdx.x; // the index of sorted_linear_ind + } if (sorted_linear_indice_id >= num_indices) { // don't have warp divergence when embedding dim is multiple of 32 return; } + // the index row in the embedding table + SIndex index = sorted_linear_ind_data[sorted_linear_indice_id]; + // check if this thread block is responsible for this whole linear index bool linear_index_start = (sorted_linear_indice_id == 0 || - sorted_linear_ind_data[sorted_linear_indice_id - 1] != - sorted_linear_ind_data[sorted_linear_indice_id]); + sorted_linear_ind_data[sorted_linear_indice_id - 1] != index); if (!linear_index_start) { // don't have warp divergence when embedding dim is multiple of 32 return; } - // the index row in the embedding table - SIndex index = sorted_linear_ind_data[sorted_linear_indice_id]; 
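Before the rewritten kernel body below: the new ExactBlock fast path assigns one warp per deduplicated index (blockDim.y warps per block), counts duplicates with a ballot-and-popcount loop, and swaps the cub::BlockReduce for a warp-wide shuffle all-reduce (warpReduceAllSum, defined later in this patch in adagrad_fused_op_gpu.cuh). A plain-Python model of that reduction, offered only as an illustrative sketch::

    # Mirrors `val += shfl_xor(val, mask)` for mask = 16, 8, 4, 2, 1: every lane
    # adds its XOR partner's value, so after log2(32) steps all 32 lanes hold
    # the full sum (an "all-reduce": every lane ends up with the result).
    def warp_reduce_all_sum(lane_vals, warp_size=32):
        vals = list(lane_vals)
        mask = warp_size // 2
        while mask:
            vals = [vals[i] + vals[i ^ mask] for i in range(warp_size)]
            mask //= 2
        return vals

    lanes = warp_reduce_all_sum(range(32))
    assert all(v == sum(range(32)) for v in lanes)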
- // find the num of duplicated indices. - int num_dup = 1; - while (sorted_linear_indice_id + num_dup < num_indices && - sorted_linear_ind_data[sorted_linear_indice_id + num_dup] == index) { - num_dup += 1; - } + if (ExactBlock) { + // find the num of duplicated indices. + int num_dup = 1; + while (true) { + int segment_continue = 0; + if (sorted_linear_indice_id + num_dup + threadIdx.x < num_indices) { + segment_continue = + sorted_linear_ind_data[sorted_linear_indice_id + num_dup + threadIdx.x] == + index; + } +#ifndef __HIP_PLATFORM_HCC__ + int32_t num_dup_incr = __popc(__ballot_sync(0xFFFFFFFF, segment_continue)); +#else + int32_t num_dup_incr = __popc(__ballot(segment_continue)); +#endif + num_dup += num_dup_incr; + if (num_dup_incr != kWarpSize) { + break; + } + } - // TODO: Tuning NumThreads for sum_squares - typedef cub::BlockReduce BlockReduce; - __shared__ BlockReduce::TempStorage temp_storage; - int valid = min(block_size, blockDim.x); + float sum_squares = 0.0; + extern __shared__ float x_ij[]; - float sum_squares = 0.0; - __shared__ float row_sum_squares_avg; - extern __shared__ float x_ij[]; + // we need to avoid index collision for the threads in the same block. + // Different threadIdx.y works on different `index`. + int sm_offset = threadIdx.y * block_size; - for (int i = threadIdx.x; i < block_size; i += blockDim.x) { - // i: index in the embedding dimension - float t_x_ij = 0.0; + for (int i = threadIdx.x; i < block_size; i += blockDim.x) { + // i: index in the embedding dimension + float t_x_ij = 0.0; - for (int dup_id = 0; dup_id < num_dup; dup_id++) { - int group = sorted_seg_id_data[sorted_linear_indice_id + dup_id]; - t_x_ij += grad[group * block_size + i]; + for (int dup_id = 0; dup_id < num_dup; dup_id++) { + int group = sorted_seg_id_data[sorted_linear_indice_id + dup_id]; + t_x_ij += grad[group * block_size + i]; + } + t_x_ij += weight_decay * + rand_factor.convertTypeFromParamToTarget(param[index * block_size + i]); + sum_squares += t_x_ij * t_x_ij; + + x_ij[sm_offset + i] = t_x_ij; } - t_x_ij += weight_decay * - rand_factor.convertTypeFromParamToTarget(param[index * block_size + i]);; - sum_squares += t_x_ij * t_x_ij; - x_ij[i] = t_x_ij; - } - float reduce_result = BlockReduce(temp_storage).Sum(sum_squares, valid); - if (threadIdx.x == 0) { - row_sum_squares_avg = reduce_result / static_cast(block_size); - float mom_new = param_mom[index] + static_cast(row_sum_squares_avg); + // We have a strong assumption that blockDim.x = 32, which is equal to the warp size. + float row_sum_squares_avg = warpReduceAllSum(sum_squares) / static_cast(block_size); + float mom_new = param_mom[index] + row_sum_squares_avg; param_mom[index] = mom_new; - } - __syncthreads(); - // update param - float step = LR / (sqrtf(param_mom[index]) + epsilon); - for (int i = threadIdx.x; i < block_size; i += blockDim.x) { - const size_t paramIdx = index * block_size + i; // index for param - param[paramIdx] = - rand_factor.convertTypeFromTargetToParam(param[paramIdx] + x_ij[i] * step); + // update param + float step = LR / (sqrtf(mom_new) + epsilon); + for (int i = threadIdx.x; i < block_size; i += blockDim.x) { + const size_t paramIdx = index * block_size + i; // index for param + param[paramIdx] = rand_factor.convertTypeFromTargetToParam( + rand_factor.convertTypeFromParamToTarget(param[paramIdx]) + x_ij[sm_offset + i] * step); + } + } else { + // find the num of duplicated indices. 
+ int num_dup = 1; + while (sorted_linear_indice_id + num_dup < num_indices && + sorted_linear_ind_data[sorted_linear_indice_id + num_dup] == index) { + num_dup += 1; + } + + // TODO: Tuning NumThreads for sum_squares + typedef cub::BlockReduce BlockReduce; + __shared__ BlockReduce::TempStorage temp_storage; + int valid = min(block_size, blockDim.x); + + float sum_squares = 0.0; + __shared__ float row_sum_squares_avg; + extern __shared__ float x_ij[]; + + for (int i = threadIdx.x; i < block_size; i += blockDim.x) { + // i: index in the embedding dimension + float t_x_ij = 0.0; + + for (int dup_id = 0; dup_id < num_dup; dup_id++) { + int group = sorted_seg_id_data[sorted_linear_indice_id + dup_id]; + t_x_ij += grad[group * block_size + i]; + } + t_x_ij += weight_decay * + rand_factor.convertTypeFromParamToTarget(param[index * block_size + i]); + sum_squares += t_x_ij * t_x_ij; + x_ij[i] = t_x_ij; + } + float reduce_result = BlockReduce(temp_storage).Sum(sum_squares, valid); + + if (threadIdx.x == 0) { + row_sum_squares_avg = reduce_result / static_cast(block_size); + float mom_new = param_mom[index] + row_sum_squares_avg; + param_mom[index] = mom_new; + } + __syncthreads(); + + // update param + float step = LR / (sqrtf(param_mom[index]) + epsilon); + for (int i = threadIdx.x; i < block_size; i += blockDim.x) { + const size_t paramIdx = index * block_size + i; // index for param + param[paramIdx] = rand_factor.convertTypeFromTargetToParam( + rand_factor.convertTypeFromParamToTarget(param[paramIdx]) + x_ij[i] * step); + } } } @@ -570,7 +633,10 @@ class CUDASparseAdagradFusedWithSparseLengthsSumGradientOp final is_mean ? grad_buffer_.template mutable_data() : NULL; if (is_mean) { gradient_mean_kernel - <<>>( + <<>>( grad, lengths, grad_buffer_data, block_size); } @@ -934,7 +1000,10 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp final is_mean ? grad_buffer_.template mutable_data() : NULL; if (is_mean) { gradient_mean_kernel - <<>>( + <<>>( grad, lengths, grad_buffer_data, block_size); } @@ -1179,10 +1248,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsSumGradientExactOp final sorted_seg_id_buffer_.ResizeLike(indicesInput); linear_index_weight_offsets_dedup_kernel - <<>>( + <<>>( indices, prefix_sum_length_data, seg_id_buffer_.template mutable_data()); @@ -1206,60 +1272,137 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsSumGradientExactOp final seed.y = maxThreads * block_size; } - CAFFE_ENFORCE_LE(block_size, 10240, - "Block size is too big and will exceed the max size of the shared memory"); - if (round_option_ == STOCHASTIC) { - rowwise_sparse_adagrad_fused_length_sum_gradient_dedup_kernel< - IndexType, - TParam, - T, - false, - STOCHASTIC> - <<>>( - prefix_sum_length_data, - N, - block_size, - num_lengths, - num_indices, - epsilon_, - paramOut, - momentOut, - indices, - is_mean ? grad_buffer_data : grad, - sorted_linear_ind_buffer_.template data(), - sorted_seg_id_buffer_.template data(), - lr, - seed, - weight_decay_); + if (block_size <= maxThreads / 2 && block_size % 32 == 0) { + // Fast path when the embedding dimension is a multiple of 32, using + // WarpReduce. 
+ constexpr int kWarpNum = 8; + const dim3 threads(kWarpSize, kWarpNum); + const dim3 blocks((num_indices + kWarpNum - 1) / kWarpNum); + CAFFE_ENFORCE_LE( + kWarpNum * kWarpSize, + maxThreads, + "the total number of threads in a block should be smaller than or equal to maxThreads"); + + const int sm_size = block_size * kWarpNum * sizeof(float); + // Maximum shared memory allocated per thread block is 48 KB on Maxwell/Pascal + CAFFE_ENFORCE_LE( + sm_size, + 1024 * 48, + "Block size is too big and will exceed the max size of the shared memory"); + + if (round_option_ == STOCHASTIC) { + rowwise_sparse_adagrad_fused_length_sum_gradient_dedup_kernel< + IndexType, + TParam, + T, + true, + STOCHASTIC> + <<>>( + prefix_sum_length_data, + N, + block_size, + num_lengths, + num_indices, + epsilon_, + paramOut, + momentOut, + indices, + is_mean ? grad_buffer_data : grad, + sorted_linear_ind_buffer_.template data(), + sorted_seg_id_buffer_.template data(), + lr, + seed, + weight_decay_); + } else { + rowwise_sparse_adagrad_fused_length_sum_gradient_dedup_kernel< + IndexType, + TParam, + T, + true, + NEAREST> + <<>>( + prefix_sum_length_data, + N, + block_size, + num_lengths, + num_indices, + epsilon_, + paramOut, + momentOut, + indices, + is_mean ? grad_buffer_data : grad, + sorted_linear_ind_buffer_.template data(), + sorted_seg_id_buffer_.template data(), + lr, + seed, + weight_decay_); + } } else { - rowwise_sparse_adagrad_fused_length_sum_gradient_dedup_kernel< - IndexType, - TParam, - T, - false, - NEAREST> - <<>>( - prefix_sum_length_data, - N, - block_size, - num_lengths, - num_indices, - epsilon_, - paramOut, - momentOut, - indices, - is_mean ? grad_buffer_data : grad, - sorted_linear_ind_buffer_.template data(), - sorted_seg_id_buffer_.template data(), - lr, - seed, - weight_decay_); + const int sm_size = block_size * sizeof(float); + // Maximum shared memory allocated per thread block is 48 KB on Maxwell/Pascal + CAFFE_ENFORCE_LE( + sm_size, + 1024 * 48, + "Block size is too big and will exceed the max size of the shared memory"); + if (round_option_ == STOCHASTIC) { + rowwise_sparse_adagrad_fused_length_sum_gradient_dedup_kernel< + IndexType, + TParam, + T, + false, + STOCHASTIC> + <<>>( + prefix_sum_length_data, + N, + block_size, + num_lengths, + num_indices, + epsilon_, + paramOut, + momentOut, + indices, + is_mean ? grad_buffer_data : grad, + sorted_linear_ind_buffer_.template data(), + sorted_seg_id_buffer_.template data(), + lr, + seed, + weight_decay_); + } else { + rowwise_sparse_adagrad_fused_length_sum_gradient_dedup_kernel< + IndexType, + TParam, + T, + false, + NEAREST> + <<>>( + prefix_sum_length_data, + N, + block_size, + num_lengths, + num_indices, + epsilon_, + paramOut, + momentOut, + indices, + is_mean ? 
grad_buffer_data : grad, + sorted_linear_ind_buffer_.template data(), + sorted_seg_id_buffer_.template data(), + lr, + seed, + weight_decay_); + } } return true; diff --git a/caffe2/sgd/adagrad_fused_op_gpu.cuh b/caffe2/sgd/adagrad_fused_op_gpu.cuh index 9a5f53bead12..e695dac37e4d 100644 --- a/caffe2/sgd/adagrad_fused_op_gpu.cuh +++ b/caffe2/sgd/adagrad_fused_op_gpu.cuh @@ -26,6 +26,27 @@ namespace caffe2 { +constexpr int kWarpSize = 32; + +template +inline __device__ T shfl_xor(const T val, int laneMask, int width = kWarpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_xor_sync(0xffffffff, val, laneMask, width); +#else + return __shfl_xor(val, laneMask, width); +#endif +} + +/// Sums a register value across all warp threads +template +inline __device__ T warpReduceAllSum(T val) { +#pragma unroll + for (int mask = ReduceWidth / 2; mask > 0; mask >>= 1) { + val += shfl_xor(val, mask); + } + return val; +} + enum roundOption : int { NEAREST = 0, STOCHASTIC = 1 }; template diff --git a/caffe2/sgd/learning_rate_op.cc b/caffe2/sgd/learning_rate_op.cc index 534f89d68360..e8172ab65efe 100644 --- a/caffe2/sgd/learning_rate_op.cc +++ b/caffe2/sgd/learning_rate_op.cc @@ -164,7 +164,7 @@ C10_EXPORT_CAFFE2_OP_TO_C10_CPU( "int? max_iter = -1, " "int? num_iter = 0, " "float? start_multiplier = 0, " - "float? end_mulitplier = 0, " + "float? end_multiplier = 0, " "float? multiplier = 0.5, " "float? multiplier_1 = 1.0, " "float? multiplier_2 = 1.0, " @@ -184,5 +184,6 @@ C10_EXPORT_CAFFE2_OP_TO_C10_CPU( "float? cosine_max_lr = 0.05, " "int? cosine_period = 50, " "float? cosine_t_mult = 1.0, " - "float? cosine_lr_shrink = 0.99) -> Tensor output", + "float? cosine_lr_shrink = 0.99, " + "float? decay = 1.0) -> Tensor output", LearningRateOpFloatCPU); diff --git a/caffe2/sgd/learning_rate_op.h b/caffe2/sgd/learning_rate_op.h index fa35ad4c8d6f..fb0998a65d71 100644 --- a/caffe2/sgd/learning_rate_op.h +++ b/caffe2/sgd/learning_rate_op.h @@ -62,7 +62,7 @@ class LearningRateOp final : public Operator { active_period, inactive_period, active_first); } else if (policy == "hill") { int64_t num_iter = - this->template GetSingleArgument(arg_prefix + "num_iter", 0); + this->template GetSingleArgument(arg_prefix + "num_iter", 0); DCHECK_GT(num_iter, 0); T start_multiplier = this->template GetSingleArgument( arg_prefix + "start_multiplier", 0.); @@ -81,13 +81,13 @@ class LearningRateOp final : public Operator { return new HillLearningRate( num_iter, start_multiplier, gamma, power, end_multiplier); } else if (policy == "slope") { - int64_t num_iter_1 = - this->template GetSingleArgument(arg_prefix + "num_iter_1", 0); + int64_t num_iter_1 = this->template GetSingleArgument( + arg_prefix + "num_iter_1", 0); DCHECK_GT(num_iter_1, 0); T multiplier_1 = this->template GetSingleArgument( arg_prefix + "multiplier_1", 0.); - int64_t num_iter_2 = - this->template GetSingleArgument(arg_prefix + "num_iter_2", 0); + int64_t num_iter_2 = this->template GetSingleArgument( + arg_prefix + "num_iter_2", 0); DCHECK_GT(num_iter_1, 0); T multiplier_2 = this->template GetSingleArgument( arg_prefix + "multiplier_2", 0.); @@ -191,16 +191,16 @@ class LearningRateOp final : public Operator { int stepsize = this->template GetSingleArgument(arg_prefix + "stepsize", 0); T decay = - this->template GetSingleArgument(arg_prefix + "decay", 1.0); + this->template GetSingleArgument(arg_prefix + "decay", 1.0); DCHECK_GT(stepsize, 0); DCHECK_GE(max_lr, base_lr_); return new CyclicalLearningRate(base_lr_, max_lr, stepsize, decay); } else if (policy == 
"constantThenLinearWarmup") { T start_warmup_multiplier = this->template GetSingleArgument( arg_prefix + "start_warmup_multiplier", 0.1); - int64_t constant_warmup_num_iter = this->template GetSingleArgument( + int64_t constant_warmup_num_iter = this->template GetSingleArgument( arg_prefix + "constant_warmup_num_iter", 10000000); - int64_t linear_warmup_num_iter = this->template GetSingleArgument( + int64_t linear_warmup_num_iter = this->template GetSingleArgument( arg_prefix + "linear_warmup_num_iter", 10000000); return new ConstantThenLinearWarmupLearningRate( start_warmup_multiplier, @@ -209,9 +209,9 @@ class LearningRateOp final : public Operator { } else if (policy == "compositeCyclical") { T start_warmup_multiplier = this->template GetSingleArgument( arg_prefix + "start_warmup_multiplier", 0.1); - int64_t constant_warmup_num_iter = this->template GetSingleArgument( + int64_t constant_warmup_num_iter = this->template GetSingleArgument( arg_prefix + "constant_warmup_num_iter", 10000000); - int64_t linear_warmup_num_iter = this->template GetSingleArgument( + int64_t linear_warmup_num_iter = this->template GetSingleArgument( arg_prefix + "linear_warmup_num_iter", 10000000); T cyclical_max_lr = this->template GetSingleArgument( arg_prefix + "cyclical_max_lr", 0.05); @@ -245,9 +245,9 @@ class LearningRateOp final : public Operator { } else if (policy == "compositeCosine") { T start_warmup_multiplier = this->template GetSingleArgument( arg_prefix + "start_warmup_multiplier", 0.1); - int64_t constant_warmup_num_iter = this->template GetSingleArgument( + int64_t constant_warmup_num_iter = this->template GetSingleArgument( arg_prefix + "constant_warmup_num_iter", 10000000); - int64_t linear_warmup_num_iter = this->template GetSingleArgument( + int64_t linear_warmup_num_iter = this->template GetSingleArgument( arg_prefix + "linear_warmup_num_iter", 10000000); T cosine_max_lr = this->template GetSingleArgument( arg_prefix + "cosine_max_lr", 0.5); diff --git a/caffe2/utils/CMakeLists.txt b/caffe2/utils/CMakeLists.txt index 798985953b89..62190501cdac 100644 --- a/caffe2/utils/CMakeLists.txt +++ b/caffe2/utils/CMakeLists.txt @@ -1,9 +1,13 @@ if((NOT BUILD_CAFFE2) OR (INTERN_BUILD_MOBILE AND NOT BUILD_CAFFE2_MOBILE)) list(APPEND Caffe2_CPU_SRCS utils/string_utils.cc - utils/threadpool/pthreadpool-cpp.cc utils/threadpool/ThreadPool.cc ) + + if(USE_PTHREADPOOL AND NOT USE_INTERNAL_PTHREADPOOL_IMPL) + list(APPEND Caffe2_CPU_SRCS utils/threadpool/pthreadpool-cpp.cc) + endif() + if(NOT BUILD_CAFFE2) list(APPEND Caffe2_CPU_SRCS utils/proto_wrap.cc diff --git a/caffe2/utils/GpuDefs.cuh b/caffe2/utils/GpuDefs.cuh index 46d8058c84b5..be591cc95b92 100644 --- a/caffe2/utils/GpuDefs.cuh +++ b/caffe2/utils/GpuDefs.cuh @@ -7,16 +7,9 @@ namespace caffe2 { // Static definition of GPU warp size for unrolling and code generation -#ifdef __CUDA_ARCH__ -#if __CUDA_ARCH__ <= 800 -constexpr int kWarpSize = 32; -#else -#error Unknown __CUDA_ARCH__; please define parameters for compute capability -#endif // __CUDA_ARCH__ types -#elif defined(__HIP_PLATFORM_HCC__) +#if defined(__HIP_PLATFORM_HCC__) constexpr int kWarpSize = warpSize; // = 64 (Defined in hip_runtime.h) #else -// dummy value for host compiler constexpr int kWarpSize = 32; #endif // __CUDA_ARCH__ diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake index 63e2d9f4d934..dbfd55e2d0d5 100644 --- a/cmake/Codegen.cmake +++ b/cmake/Codegen.cmake @@ -144,7 +144,7 @@ if(INTERN_BUILD_ATEN_OPS) endforeach() list(APPEND ATen_CPU_SRCS ${cpu_kernel_cpp}) - file(GLOB 
all_python "${CMAKE_CURRENT_LIST_DIR}/../tools/codegen/*.py") + file(GLOB_RECURSE all_python "${CMAKE_CURRENT_LIST_DIR}/../tools/codegen/*.py") set(GEN_ROCM_FLAG) if(USE_ROCM) @@ -167,7 +167,7 @@ if(INTERN_BUILD_ATEN_OPS) endif() execute_process( COMMAND - "${PYTHON_EXECUTABLE}" ${CMAKE_CURRENT_LIST_DIR}/../tools/code_analyzer/gen_op_registration_whitelist.py + "${PYTHON_EXECUTABLE}" ${CMAKE_CURRENT_LIST_DIR}/../tools/code_analyzer/gen_op_registration_allowlist.py --op-dependency "${OP_DEPENDENCY}" --root-ops "${SELECTED_OP_LIST}" OUTPUT_VARIABLE OP_REGISTRATION_WHITELIST diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 028098f61d36..1bbb98fb3614 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1253,10 +1253,7 @@ if(USE_CUDA) endif() if(USE_GLOO) - if(MSVC) - message(WARNING "Gloo can not be used on Windows.") - caffe2_update_option(USE_GLOO OFF) - elseif(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) + if(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) message(WARNING "Gloo can only be used on 64-bit systems.") caffe2_update_option(USE_GLOO OFF) else() @@ -1507,7 +1504,8 @@ if(NOT INTERN_BUILD_MOBILE) if(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5) message(STATUS "Found CUDA with FP16 support, compiling with torch.cuda.HalfTensor") - list(APPEND CUDA_NVCC_FLAGS "-DCUDA_HAS_FP16=1" "-D__CUDA_NO_HALF_OPERATORS__" "-D__CUDA_NO_HALF_CONVERSIONS__" "-D__CUDA_NO_HALF2_OPERATORS__") + list(APPEND CUDA_NVCC_FLAGS "-DCUDA_HAS_FP16=1" "-D__CUDA_NO_HALF_OPERATORS__" "-D__CUDA_NO_HALF_CONVERSIONS__" + "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" "-D__CUDA_NO_HALF2_OPERATORS__") add_compile_options(-DCUDA_HAS_FP16=1) else() message(STATUS "Could not find CUDA with FP16 support, compiling without torch.CudaHalfTensor") diff --git a/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake b/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake index c17dfa751417..9caf2f408a16 100644 --- a/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake +++ b/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake @@ -84,9 +84,19 @@ endif() if(CUDA_VERSION VERSION_GREATER "10.5") list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Ampere") - list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.0" "8.0+PTX") + list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.0") list(APPEND CUDA_ALL_GPU_ARCHITECTURES "8.0") + if(CUDA_VERSION VERSION_LESS "11.1") + set(CUDA_LIMIT_GPU_ARCHITECTURE "8.6") + list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.0+PTX") + endif() +endif() + +if(CUDA_VERSION VERSION_GREATER "11.0") + list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.6" "8.6+PTX") + list(APPEND CUDA_ALL_GPU_ARCHITECTURES "8.6") + if(CUDA_VERSION VERSION_LESS "12.0") set(CUDA_LIMIT_GPU_ARCHITECTURE "9.0") endif() diff --git a/cmake/ProtoBuf.cmake b/cmake/ProtoBuf.cmake index 32cab7557f3b..9a4ad35567bd 100644 --- a/cmake/ProtoBuf.cmake +++ b/cmake/ProtoBuf.cmake @@ -196,7 +196,7 @@ function(caffe2_protobuf_generate_cpp_py srcs_var hdrs_var python_var) # If we remove all reference to these pb.h files from external # libraries and binaries this rewrite can be removed. 
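Restating the select_compute_arch.cmake hunk above in executable form, since the nested version checks are easy to misread: CUDA 11.0 gains SM 8.0 (plus 8.0+PTX) and caps architecture detection below 8.6, while CUDA 11.1 and newer also target SM 8.6 (plus 8.6+PTX) and cap detection below 9.0. A Python paraphrase of that logic, as an illustrative sketch only::

    # Editor's paraphrase of the CMake branches; version tuples stand in for
    # CMake's VERSION_GREATER / VERSION_LESS comparisons.
    def ampere_arch_selection(cuda_version):
        common, limit = [], None
        if cuda_version > (10, 5):
            common.append("8.0")
            if cuda_version < (11, 1):
                limit = "8.6"
                common.append("8.0+PTX")
        if cuda_version > (11, 0):
            common += ["8.6", "8.6+PTX"]
            if cuda_version < (12, 0):
                limit = "9.0"
        return common, limit

    print(ampere_arch_selection((11, 0)))  # (['8.0', '8.0+PTX'], '8.6')
    print(ampere_arch_selection((11, 1)))  # (['8.0', '8.6', '8.6+PTX'], '9.0')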
- COMMAND ${CMAKE_COMMAND} -DFILENAME=${CMAKE_CURRENT_BINARY_DIR}/${fil_we}.pb.h -DNAMESPACES=caffe\;caffe2\;onnx\;torch -P ${PROJECT_SOURCE_DIR}/cmake/ProtoBufPatch.cmake + COMMAND ${CMAKE_COMMAND} -DFILENAME=${CMAKE_CURRENT_BINARY_DIR}/${fil_we}.pb.h -DNAMESPACES=caffe\;caffe2\;onnx\;torch -DLOCAL_PROTOBUF=${CAFFE2_LINK_LOCAL_PROTOBUF} -P ${PROJECT_SOURCE_DIR}/cmake/ProtoBufPatch.cmake DEPENDS ${CAFFE2_PROTOC_EXECUTABLE} ${abs_fil} COMMENT "Running C++/Python protocol buffer compiler on ${fil}" VERBATIM ) @@ -209,6 +209,7 @@ function(caffe2_protobuf_generate_cpp_py srcs_var hdrs_var python_var) COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}" COMMAND ${CAFFE2_PROTOC_EXECUTABLE} -I${PROJECT_SOURCE_DIR} --cpp_out=${DLLEXPORT_STR}${PROJECT_BINARY_DIR} ${abs_fil} COMMAND ${CAFFE2_PROTOC_EXECUTABLE} -I${PROJECT_SOURCE_DIR} --python_out "${PROJECT_BINARY_DIR}" ${abs_fil} + COMMAND ${CMAKE_COMMAND} -DFILENAME=${CMAKE_CURRENT_BINARY_DIR}/${fil_we}.pb.h -DNAMESPACES=caffe\;caffe2\;onnx\;torch -DLOCAL_PROTOBUF=${CAFFE2_LINK_LOCAL_PROTOBUF} -P ${PROJECT_SOURCE_DIR}/cmake/ProtoBufPatch.cmake DEPENDS ${CAFFE2_PROTOC_EXECUTABLE} ${abs_fil} COMMENT "Running C++/Python protocol buffer compiler on ${fil}" VERBATIM ) endif() diff --git a/cmake/ProtoBufPatch.cmake b/cmake/ProtoBufPatch.cmake index 2124b6189799..704dcd7da154 100644 --- a/cmake/ProtoBufPatch.cmake +++ b/cmake/ProtoBufPatch.cmake @@ -1,41 +1,83 @@ # CMake file to replace the string contents in ONNX, Caffe, and Caffe2 proto. # Usage example: -# cmake -DFILENAME=caffe2.pb.h -P ProtoBufPatch.cmake +# cmake -DFILENAME=caffe2.pb.h -DLOCAL_PROTOBUF=ON -P ProtoBufPatch.cmake file(READ ${FILENAME} content) -# protobuf-3.6.0 pattern -string( - REPLACE - "::google::protobuf::internal::GetEmptyStringAlreadyInited" - "GetEmptyStringAlreadyInited" - content - "${content}") +if(LOCAL_PROTOBUF) + # protobuf-3.6.0 pattern + string( + REPLACE + "::google::protobuf::internal::GetEmptyStringAlreadyInited" + "GetEmptyStringAlreadyInited" + content + "${content}") -# protobuf-3.8.0+ pattern -string( - REPLACE - "::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited" - "GetEmptyStringAlreadyInited" - content - "${content}") + # protobuf-3.8.0+ pattern + string( + REPLACE + "::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited" + "GetEmptyStringAlreadyInited" + content + "${content}") -string( - REPLACE - "PROTOBUF_CONSTEXPR" - "" - content - "${content}") + string( + REPLACE + "PROTOBUF_CONSTEXPR" + "" + content + "${content}") -# https://github.com/protocolbuffers/protobuf/commit/0400cca3236de1ca303af38bf81eab332d042b7c -# changes PROTOBUF_CONSTEXPR to constexpr, which breaks windows -# build. -string( - REGEX REPLACE - "static constexpr ([^ ]+) ([^ ]+) =" - "static \\1 const \\2 =" - content - "${content}") + # https://github.com/protocolbuffers/protobuf/commit/0400cca3236de1ca303af38bf81eab332d042b7c + # changes PROTOBUF_CONSTEXPR to constexpr, which breaks windows + # build. + string( + REGEX REPLACE + "static constexpr ([^ ]+) ([^ ]+) =" + "static \\1 const \\2 =" + content + "${content}") + + foreach(ns ${NAMESPACES}) + # Insert "const ::std::string& GetEmptyStringAlreadyInited();" within + # the namespace and make sure we only do it once in the file. Unfortunately + # using string(REPLACE ...) doesn't work because it will replace at all + # locations and there might be multiple declarations of the namespace + # depending on how the proto is structured. 
+ set(search "namespace ${ns} {") + string(LENGTH "${search}" search_len) + string(FIND "${content}" "${search}" pos) + if(${pos} GREATER -1) + math(EXPR pos "${pos}+${search_len}") + string(SUBSTRING "${content}" 0 ${pos} content_pre) + string(SUBSTRING "${content}" ${pos} -1 content_post) + string( + CONCAT + content + "${content_pre}" + " const ::std::string& GetEmptyStringAlreadyInited(); " + "${content_post}") + endif() + endforeach() + + # The moving constructor is defined in the header file, which will cause + # a link error that claims that the vftable is not found. Luckily, we + # could move the definition into the source file to solve the problem. + list(LENGTH NAMESPACES ns_count) + if("${FILENAME}" MATCHES ".pb.h" AND ns_count EQUAL 1) + string(REPLACE ".pb.h" ".pb.cc" SOURCE_FILENAME ${FILENAME}) + file(READ ${SOURCE_FILENAME} content_cc_origin) + + string(REGEX MATCHALL "([a-zA-Z_]+)\\([a-zA-Z_]+&& from\\) noexcept[^}]*}" content_cc "${content}") + string(REGEX REPLACE "};" "}\n" content_cc "${content_cc}") + string(REGEX REPLACE "([a-zA-Z_]+)\\([a-zA-Z_]+&& from\\) noexcept" " \\1::\\1(\\1&& from) noexcept" content_cc "${content_cc}") + set(content_cc "${content_cc_origin}\nnamespace ${NAMESPACES} {\n#if LANG_CXX11\n${content_cc}\n#endif\n}") + + string(REGEX REPLACE "([a-zA-Z_]+)\\([a-zA-Z_]+&& from\\) noexcept([^}]*)}" "\\1(\\1&& from) noexcept;" content "${content}") + + file(WRITE ${SOURCE_FILENAME} "${content_cc}") + endif() +endif() # constexpr int TensorBoundShape_DimType_DimType_ARRAYSIZE = TensorBoundShape_DimType_DimType_MAX + 1; # throws @@ -53,44 +95,4 @@ string( content "${content}") -foreach(ns ${NAMESPACES}) - # Insert "const ::std::string& GetEmptyStringAlreadyInited();" within - # the namespace and make sure we only do it once in the file. Unfortunately - # using string(REPLACE ...) doesn't work because it will replace at all - # locations and there might be multiple declarations of the namespace - # depending on how the proto is structured. - set(search "namespace ${ns} {") - string(LENGTH "${search}" search_len) - string(FIND "${content}" "${search}" pos) - if(${pos} GREATER -1) - math(EXPR pos "${pos}+${search_len}") - string(SUBSTRING "${content}" 0 ${pos} content_pre) - string(SUBSTRING "${content}" ${pos} -1 content_post) - string( - CONCAT - content - "${content_pre}" - " const ::std::string& GetEmptyStringAlreadyInited(); " - "${content_post}") - endif() -endforeach() - -# The moving constructor is defined in the header file, which will cause -# a link error that claims that the vftable is not found. Luckily, we -# could move the definition into the source file to solve the problem. 
-list(LENGTH NAMESPACES ns_count) -if("${FILENAME}" MATCHES ".pb.h" AND ns_count EQUAL 1) - string(REPLACE ".pb.h" ".pb.cc" SOURCE_FILENAME ${FILENAME}) - file(READ ${SOURCE_FILENAME} content_cc_origin) - - string(REGEX MATCHALL "([a-zA-Z_]+)\\([a-zA-Z_]+&& from\\) noexcept[^}]*}" content_cc "${content}") - string(REGEX REPLACE "};" "}\n" content_cc "${content_cc}") - string(REGEX REPLACE "([a-zA-Z_]+)\\([a-zA-Z_]+&& from\\) noexcept" " \\1::\\1(\\1&& from) noexcept" content_cc "${content_cc}") - set(content_cc "${content_cc_origin}\nnamespace ${NAMESPACES} {\n#if LANG_CXX11\n${content_cc}\n#endif\n}") - - string(REGEX REPLACE "([a-zA-Z_]+)\\([a-zA-Z_]+&& from\\) noexcept([^}]*)}" "\\1(\\1&& from) noexcept;" content "${content}") - - file(WRITE ${SOURCE_FILENAME} "${content_cc}") -endif() - file(WRITE ${FILENAME} "${content}") diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 3d4da7f06176..9d848c60c987 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -44,6 +44,7 @@ function(caffe2_print_configuration_summary) message(STATUS " Python site-packages: ${PYTHON_SITE_PACKAGES}") endif() message(STATUS " BUILD_SHARED_LIBS : ${BUILD_SHARED_LIBS}") + message(STATUS " CAFFE2_USE_MSVC_STATIC_RUNTIME : ${CAFFE2_USE_MSVC_STATIC_RUNTIME}") message(STATUS " BUILD_TEST : ${BUILD_TEST}") message(STATUS " BUILD_JNI : ${BUILD_JNI}") message(STATUS " BUILD_MOBILE_AUTOGRAD : ${BUILD_MOBILE_AUTOGRAD}") diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake index 8b60915f7e00..c9ac37783d1c 100644 --- a/cmake/public/cuda.cmake +++ b/cmake/public/cuda.cmake @@ -478,7 +478,7 @@ foreach(diag cc_clobber_ignored integer_sign_change useless_using_declaration endforeach() # Set C++14 support -set(CUDA_PROPAGATE_HOST_FLAGS_BLACKLIST "-Werror") +set(CUDA_PROPAGATE_HOST_FLAGS_BLOCKLIST "-Werror") if(MSVC) list(APPEND CUDA_NVCC_FLAGS "--Werror" "cross-execution-space-call") list(APPEND CUDA_NVCC_FLAGS "--no-host-device-move-forward") @@ -490,7 +490,7 @@ endif() # OpenMP flags for NVCC with Clang-cl if("${CMAKE_CXX_SIMULATE_ID}" STREQUAL "MSVC" AND "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - list(APPEND CUDA_PROPAGATE_HOST_FLAGS_BLACKLIST "-Xclang" "-fopenmp") + list(APPEND CUDA_PROPAGATE_HOST_FLAGS_BLOCKLIST "-Xclang" "-fopenmp") if(MSVC_TOOLSET_VERSION LESS 142) list(APPEND CUDA_NVCC_FLAGS "-Xcompiler" "-openmp") else() diff --git a/docker.Makefile b/docker.Makefile index ba53b94d7898..18acced1de8d 100644 --- a/docker.Makefile +++ b/docker.Makefile @@ -9,7 +9,7 @@ DOCKER_ORG = $(shell whoami) endif BASE_RUNTIME = ubuntu:18.04 -BASE_DEVEL = nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04 +BASE_DEVEL = nvidia/cuda:11.0-cudnn8-devel-ubuntu18.04 # The conda channel to use to install pytorch / torchvision INSTALL_CHANNEL = pytorch diff --git a/docs/cpp/source/index.rst b/docs/cpp/source/index.rst index 2bfbe63f47c6..39c63ddd5d7b 100644 --- a/docs/cpp/source/index.rst +++ b/docs/cpp/source/index.rst @@ -1,20 +1,20 @@ PyTorch C++ API =============== -These pages provide documentation for the public portions of the PyTorch C++ +These pages provide the documentation for the public portions of the PyTorch C++ API. 
This API can roughly be divided into five parts: -- **ATen**: The foundational tensor and mathematical operation library on which all else is built; -- **Autograd**: Augments ATen with automatic differentiation; -- **C++ Frontend**: High level constructs for training and evaluation of machine learning models; -- **TorchScript**: An interface to the TorchScript JIT compiler and interpreter; +- **ATen**: The foundational tensor and mathematical operation library on which all else is built. +- **Autograd**: Augments ATen with automatic differentiation. +- **C++ Frontend**: High level constructs for training and evaluation of machine learning models. +- **TorchScript**: An interface to the TorchScript JIT compiler and interpreter. - **C++ Extensions**: A means of extending the Python API with custom C++ and CUDA routines. -Together, these building blocks form a research and +Combining, these building blocks form a research and production ready C++ library for tensor computation and dynamic neural networks with strong emphasis on GPU acceleration as well as fast CPU performance. It is currently in use at Facebook in research and -production; we look forward to welcoming more users of the PyTorch C++ API. +production; we are looking forward to welcome more users of the PyTorch C++ API. .. warning:: @@ -76,7 +76,7 @@ C++ Frontend ------------ The PyTorch C++ frontend provides a high level, pure C++ modeling interface for -neural network and general machine learning research and production use cases, +neural network and general ML(Machine Learning) research and production use cases, largely following the Python API in design and provided functionality. The C++ frontend includes the following: @@ -119,7 +119,7 @@ expanded on a continuous and active basis. TorchScript ----------- -TorchScript a representation of a PyTorch model that can be understood, +TorchScript is a representation of a PyTorch model that can be understood, compiled and serialized by the TorchScript compiler. Fundamentally, TorchScript is a programming language in its own right. It is a subset of Python using the PyTorch API. The C++ interface to TorchScript encompasses three primary pieces of @@ -150,7 +150,7 @@ CUDA to accelerate research in vanilla PyTorch setups. The C++ extension API does not add any new functionality to the PyTorch C++ API. Instead, it provides integration with Python setuptools as well as JIT compilation mechanisms that allow access to ATen, the autograd and other C++ APIs from -Python. To learn more about the C++ extension API, see +Python. To learn more about the C++ extension API, go through `this tutorial `_. Contents @@ -183,4 +183,4 @@ Acknowledgements This documentation website for the PyTorch C++ universe has been enabled by the `Exhale `_ project and generous investment of time and effort by its maintainer, `svenevs `_. -We thank Stephen for his work and his help with the PyTorch C++ documentation. +We thank Stephen for his work and his efforts providing help with the PyTorch C++ documentation. 
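As a concrete companion to the C++ API overview edited above: the usual handoff is to script or trace a model on the Python side and serialize it, after which the artifact can be loaded from C++ with torch::jit::load. A minimal Python-side sketch (the Scale module and file name are made up for illustration)::

    import torch

    class Scale(torch.nn.Module):
        """Toy module used only for this illustration."""
        def __init__(self, factor: float):
            super().__init__()
            self.factor = factor

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return x * self.factor

    scripted = torch.jit.script(Scale(2.0))  # compile to TorchScript
    scripted.save("scale.pt")                # loadable from C++ via torch::jit::load("scale.pt")
    print(scripted(torch.ones(3)))           # tensor([2., 2., 2.])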
diff --git a/docs/source/community/persons_of_interest.rst b/docs/source/community/persons_of_interest.rst index c152ca616571..f346fbe994e6 100644 --- a/docs/source/community/persons_of_interest.rst +++ b/docs/source/community/persons_of_interest.rst @@ -25,7 +25,6 @@ torch.* torch.nn ~~~~~~~~ -- Thomas Viehmann (`t-vi `__) - Adam Paszke (`apaszke `__) - Greg Chanan (`gchanan `__) - Soumith Chintala (`soumith `__) diff --git a/docs/source/data.rst b/docs/source/data.rst index 9ba88f02c31f..c5d6f61b7ba9 100644 --- a/docs/source/data.rst +++ b/docs/source/data.rst @@ -403,6 +403,7 @@ Example:: .. autoclass:: TensorDataset .. autoclass:: ConcatDataset .. autoclass:: ChainDataset +.. autoclass:: BufferedShuffleDataset .. autoclass:: Subset .. autofunction:: torch.utils.data.get_worker_info .. autofunction:: torch.utils.data.random_split diff --git a/docs/source/distributed.rst b/docs/source/distributed.rst index a248d3e4ca83..c83b5a1d34de 100644 --- a/docs/source/distributed.rst +++ b/docs/source/distributed.rst @@ -52,12 +52,22 @@ MPI supports CUDA only if the implementation used to build PyTorch supports it. Backends that come with PyTorch ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -PyTorch distributed currently only supports Linux. By default, the Gloo and NCCL backends -are built and included in PyTorch distributed (NCCL only when building with CUDA). -MPI is an -optional backend that can only be included if you build PyTorch from source. (e.g. -building PyTorch on a host that has MPI installed.) +PyTorch distributed package supports Linux (stable), MacOS (stable), and Windows (prototype). +By default for Linux, the Gloo and NCCL backends are built and included in PyTorch +distributed (NCCL only when building with CUDA). MPI is an optional backend that can only be +included if you build PyTorch from source. (e.g.building PyTorch on a host that has MPI +installed.) +.. warning :: + As of PyTorch v1.7, Windows support for the distributed package only covers collective + communications with Gloo backend, `FileStore`, and `DistributedDataParallel`. Therefore, + the `init_method` argument in :func:`init_process_group` must point to a file. This works + for both local and shared file systems: + + - Local file system, ``init_method="file:///d:/tmp/some_file"`` + - Shared file system, ``init_method="file://////{machine_name}/{share_folder_name}/some_file"`` + + Similarly, if you directly pass in a `store` argument, it must be a ``FileStore`` instance. Which backend to use? ^^^^^^^^^^^^^^^^^^^^^ @@ -260,6 +270,31 @@ The machine with rank 0 will be used to set up all connections. This is the default method, meaning that ``init_method`` does not have to be specified (or can be ``env://``). +Distributed Key-Value Store +--------------------------- + +The distributed package comes with a distributed key-value store, which can be +used to share information between processes in the group as well as to +initialize the distributed pacakge in +:func:`torch.distributed.init_process_group` (by explicitly creating the store +as an alternative to specifying ``init_method``.) There are 3 choices for +Key-Value Stores: :class:`~torch.distributed.TCPStore`, +:class:`~torch.distributed.FileStore`, and :class:`~torch.distributed.HashStore`. + +.. autoclass:: Store +.. autoclass:: TCPStore +.. autoclass:: HashStore +.. autoclass:: FileStore +.. autoclass:: PrefixStore + +.. autofunction:: torch.distributed.Store.set +.. autofunction:: torch.distributed.Store.get +.. autofunction:: torch.distributed.Store.add +.. 
autofunction:: torch.distributed.Store.wait +.. autofunction:: torch.distributed.Store.num_keys +.. autofunction:: torch.distributed.Store.delete_key +.. autofunction:: torch.distributed.Store.set_timeout + Groups ------ diff --git a/docs/source/fft.rst b/docs/source/fft.rst index 8ec06a3574d2..ab50bd271d32 100644 --- a/docs/source/fft.rst +++ b/docs/source/fft.rst @@ -1,6 +1,8 @@ .. role:: hidden :class: hidden-section +.. _torch-fft-module: + torch.fft ========= @@ -19,7 +21,11 @@ Functions .. autofunction:: fft .. autofunction:: ifft +.. autofunction:: fftn +.. autofunction:: ifftn .. autofunction:: rfft .. autofunction:: irfft +.. autofunction:: rfftn +.. autofunction:: irfftn .. autofunction:: hfft .. autofunction:: ihfft diff --git a/docs/source/jit_language_reference.rst b/docs/source/jit_language_reference.rst index 4cca46fdc005..205195f59f6b 100644 --- a/docs/source/jit_language_reference.rst +++ b/docs/source/jit_language_reference.rst @@ -72,6 +72,7 @@ net models. In particular, TorchScript supports: "``Optional[T]``", "A value which is either None or type ``T``" "``Dict[K, V]``", "A dict with key type ``K`` and value type ``V``. Only ``str``, ``int``, and ``float`` are allowed as key types." "``T``", "A `TorchScript Class`_" + "``E``", "A `TorchScript Enum`_" "``NamedTuple[T0, T1, ...]``", "A :func:`collections.namedtuple ` tuple type" Unlike Python, each variable in TorchScript function must have a single static type. @@ -271,6 +272,7 @@ Example (refining types on parameters and locals): module = torch.jit.script(M(2)) module = torch.jit.script(M(None)) + .. _TorchScript Class: .. _TorchScript Classes: .. _torchscript-classes: @@ -346,6 +348,37 @@ like any other TorchScript type: print(sum_pair(p)) +.. _TorchScript Enum: +.. _TorchScript Enums: +.. _torchscript-enums: + +TorchScript Enums +^^^^^^^^^^^^^^^^^^^ + +Python enums can be used in TorchScript without any extra annotation or code: + +:: + + from enum import Enum + + + class Color(Enum): + RED = 1 + GREEN = 2 + + @torch.jit.script + def enum_fn(x: Color, y: Color) -> bool: + if x == Color.RED: + return True + + return x == y + +After an enum is defined, it can be used in both TorchScript and Python interchangeably +like any other TorchScript type. The type of the values of an enum must be ``int``, +``float``, or ``str``. All values must be of the same type; heterogenous types for enum +values are not supported. + + Named Tuples ^^^^^^^^^^^^ Types produced by :func:`collections.namedtuple ` can be used in TorchScript. diff --git a/docs/source/jit_unsupported.rst b/docs/source/jit_unsupported.rst index 8bf3e78d672a..7368abad1e30 100644 --- a/docs/source/jit_unsupported.rst +++ b/docs/source/jit_unsupported.rst @@ -87,6 +87,5 @@ we suggest using :meth:`torch.jit.trace`. 
* :class:`torch.nn.RNN` * :class:`torch.nn.AdaptiveLogSoftmaxWithLoss` * :class:`torch.autograd.Function` - * :class:`torch.autograd.no_grad` * :class:`torch.autograd.enable_grad` * :class:`torch.Generator` diff --git a/docs/source/name_inference.rst b/docs/source/name_inference.rst index 7fc84e092633..ccbb8c0c54d3 100644 --- a/docs/source/name_inference.rst +++ b/docs/source/name_inference.rst @@ -197,6 +197,8 @@ If you don't see an operation listed here, but it would help your use case, plea :meth:`Tensor.sigmoid_`,None ":meth:`Tensor.sign`, :func:`torch.sign`",:ref:`keeps_input_names-doc` :meth:`Tensor.sign_`,None + ":meth:`Tensor.sgn`, :func:`torch.sgn`",:ref:`keeps_input_names-doc` + :meth:`Tensor.sgn_`,None ":meth:`Tensor.sin`, :func:`torch.sin`",:ref:`keeps_input_names-doc` :meth:`Tensor.sin_`,None ":meth:`Tensor.sinh`, :func:`torch.sinh`",:ref:`keeps_input_names-doc` diff --git a/docs/source/nn.functional.rst b/docs/source/nn.functional.rst index eb88b50e6d56..416121cec8d6 100644 --- a/docs/source/nn.functional.rst +++ b/docs/source/nn.functional.rst @@ -483,6 +483,11 @@ Loss functions .. autofunction:: triplet_margin_loss +:hidden:`triplet_margin_with_distance_loss` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: triplet_margin_with_distance_loss + Vision functions ---------------- @@ -533,5 +538,3 @@ DataParallel functions (multi-GPU, distributed) ~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: torch.nn.parallel.data_parallel - - diff --git a/docs/source/nn.rst b/docs/source/nn.rst index 3a6cb7e19316..8d195c04037c 100644 --- a/docs/source/nn.rst +++ b/docs/source/nn.rst @@ -10,7 +10,7 @@ These are the basic building block for graphs :depth: 2 :local: :backlinks: top - + .. currentmodule:: torch.nn @@ -269,6 +269,7 @@ Loss Functions nn.CosineEmbeddingLoss nn.MultiMarginLoss nn.TripletMarginLoss + nn.TripletMarginWithDistanceLoss Vision Layers ---------------- diff --git a/docs/source/notes/cuda.rst b/docs/source/notes/cuda.rst index 230426be8695..a34b0d7231fb 100644 --- a/docs/source/notes/cuda.rst +++ b/docs/source/notes/cuda.rst @@ -214,6 +214,10 @@ complete snapshot of the memory allocator state via :meth:`~torch.cuda.memory_snapshot`, which can help you understand the underlying allocation patterns produced by your code. +Use of a caching allocator can interfere with memory checking tools such as +``cuda-memcheck``. To debug memory errors using ``cuda-memcheck``, set +``PYTORCH_NO_CUDA_MEMORY_CACHING=1`` in your environment to disable caching. + .. _cufft-plan-cache: cuFFT plan cache diff --git a/docs/source/onnx.rst b/docs/source/onnx.rst index ea45a2d7070a..3c07486b0e89 100644 --- a/docs/source/onnx.rst +++ b/docs/source/onnx.rst @@ -231,6 +231,25 @@ The dynamic control flow is captured correctly. We can verify in backends with d # [37, 37, 37]], dtype=int64)] +To avoid exporting a variable scalar tensor as a fixed value constant as part of the ONNX model, please +avoid use of ``torch.Tensor.item()``. Torch supports implicit cast of single-element tensors to numbers. 
+E.g.: :: + + class LoopModel(torch.nn.Module): + def forward(self, x, y): + res = [] + arr = x.split(2, 0) + for i in range(int(y)): + res += [arr[i].sum(0, False)] + return torch.stack(res) + + model = torch.jit.script(LoopModel()) + inputs = (torch.randn(16), torch.tensor(8)) + + out = model(*inputs) + torch.onnx.export(model, inputs, 'loop_and_list.onnx', opset_version=11, example_outputs=out) + + TorchVision support ------------------- @@ -262,6 +281,7 @@ The following operators are supported: * Conv * Dropout * Embedding (no optional arguments supported) +* EmbeddingBag * FeatureDropout (training mode not supported) * Index * MaxPool1d @@ -289,6 +309,7 @@ The following operators are supported: * avg_pool2d * avg_pool2d * avg_pool3d +* as_strided * baddbmm * bitshift * cat @@ -314,6 +335,7 @@ The following operators are supported: * exp * expand * expand_as +* eye * flatten * floor * floor_divide @@ -335,9 +357,11 @@ The following operators are supported: * instance_norm * interpolate * isnan +* KLDivLoss * layer_norm * le * leaky_relu +* len * log * log1p * log2 @@ -358,6 +382,9 @@ The following operators are supported: * narrow * ne * neg +* new_empty +* new_full +* new_zeros * nll_loss * nonzero * norm @@ -811,7 +838,10 @@ Q: Is tensor list exportable to ONNX? Yes, this is supported now for ONNX opset version >= 11. ONNX introduced the concept of Sequence in opset 11. Similar to list, Sequence is a data type that contains arbitrary number of Tensors. - Associated operators are also introduced in ONNX, such as SequenceInsert, SequenceAt, etc. E.g.: :: + Associated operators are also introduced in ONNX, such as SequenceInsert, SequenceAt, etc. + However, in-place list append within loops is not exportable to ONNX. To implement this, please use inplace + add operator. + E.g.: :: class ListLoopModel(torch.nn.Module): def forward(self, x): @@ -820,8 +850,8 @@ Q: Is tensor list exportable to ONNX? arr = x.split(2, 0) res2 = torch.zeros(3, 4, dtype=torch.long) for i in range(len(arr)): - res = res.append(arr[i].sum(0, False)) - res1 = res1.append(arr[-1 - i].sum(0, False)) + res += [arr[i].sum(0, False)] + res1 += [arr[-1 - i].sum(0, False)] res2 += 1 return torch.stack(res), torch.stack(res1), res2 diff --git a/docs/source/quantization.rst b/docs/source/quantization.rst index b597fa9f51f3..b78ed2c08586 100644 --- a/docs/source/quantization.rst +++ b/docs/source/quantization.rst @@ -77,6 +77,261 @@ The corresponding implementation is chosen automatically based on the PyTorch bu ``torch.backends.quantized.engine = 'qnnpack'`` +Quantization API Summary +--------------------------------------- + +There are three types of quantization supported in PyTorch: + +1. dynamic quantization (weights quantized with activations read/stored in + floating point and quantized for compute.) +2. static quantization (weights quantized, activations quantized, calibration + required post training) +3. quantization aware training (weights quantized, activations quantized, + quantization numerics modeled during training) + +Please see our `Introduction to Quantization on Pytorch +`_ blog post +for a more comprehensive overview of the tradeoffs between these quantization +types. + +Dynamic Quantization +^^^^^^^^^^^^^^^^^^^^ + +This is the simplest to apply form of quantization where the weights are +quantized ahead of time but the activations are dynamically quantized +during inference. 
This is used for situations where the model execution time +is dominated by loading weights from memory rather than computing the matrix +multiplications. This is true for for LSTM and Transformer type models with +small batch size. + +Diagram:: + + # original model + # all tensors and computations are in floating point + previous_layer_fp32 -- linear_fp32 -- activation_fp32 -- next_layer_fp32 + / + linear_weight_fp32 + + # dynamically quantized model + # linear and conv weights are in int8 + previous_layer_fp32 -- linear_int8_w_fp32_inp -- activation_fp32 -- next_layer_fp32 + / + linear_weight_int8 + +API example:: + + import torch + + # define a floating point model + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + self.fc = torch.nn.Linear(4, 4) + + def forward(self, x): + x = self.fc(x) + return x + + # create a model instance + model_fp32 = M() + # create a quantized model instance + model_int8 = torch.quantization.quantize_dynamic( + model_fp32, # the original model + {torch.nn.Linear}, # a set of layers to dynamically quantize + dtype=torch.qint8) # the target dtype for quantized weights + + # run the model + input_fp32 = torch.randn(4, 4, 4, 4) + res = model_int8(input_fp32) + +To learn more about dynamic quantization please see our `dynamic quantization tutorial +`_. + +Static Quantization +^^^^^^^^^^^^^^^^^^^^ + +Static quantization quantizes the weights and activations of the model. It +fuses activations into preceding layers where possible. It requires +calibration with a representative dataset to determine optimal quantization +parameters for activations. Post Training Quantization is typically used when +both memory bandwidth and compute savings are important with CNNs being a +typical use case. Static quantization is also known as Post Training +Quantization or PTQ. + +Diagram:: + + # original model + # all tensors and computations are in floating point + previous_layer_fp32 -- linear_fp32 -- activation_fp32 -- next_layer_fp32 + / + linear_weight_fp32 + + # statically quantized model + # weights and activations are in int8 + previous_layer_int8 -- linear_with_activation_int8 -- next_layer_int8 + / + linear_weight_int8 + +API Example:: + + import torch + + # define a floating point model where some layers could be statically quantized + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + # QuantStub converts tensors from floating point to quantized + self.quant = torch.quantization.QuantStub() + self.conv = torch.nn.Conv2d(1, 1, 1) + self.relu = torch.nn.ReLU() + # DeQuantStub converts tensors from quantized to floating point + self.dequant = torch.quantization.DeQuantStub() + + def forward(self, x): + # manually specify where tensors will be converted from floating + # point to quantized in the quantized model + x = self.quant(x) + x = self.conv(x) + x = self.relu(x) + # manually specify where tensors will be converted from quantized + # to floating point in the quantized model + x = self.dequant(x) + return x + + # create a model instance + model_fp32 = M() + + # model must be set to eval mode for static quantization logic to work + model_fp32.eval() + + # attach a global qconfig, which contains information about what kind + # of observers to attach. Use 'fbgemm' for server inference and + # 'qnnpack' for mobile inference. Other quantization configurations such + # as selecting symmetric or assymetric quantization and MinMax or L2Norm + # calibration techniques can be specified here. 
+ model_fp32.qconfig = torch.quantization.get_default_qconfig('fbgemm') + + # Fuse the activations to preceding layers, where applicable. + # This needs to be done manually depending on the model architecture. + # Common fusions include `conv + relu` and `conv + batchnorm + relu` + model_fp32_fused = torch.quantization.fuse_modules(model_fp32, [['conv', 'relu']]) + + # Prepare the model for static quantization. This inserts observers in + # the model that will observe activation tensors during calibration. + model_fp32_prepared = torch.quantization.prepare(model_fp32_fused) + + # calibrate the prepared model to determine quantization parameters for activations + # in a real world setting, the calibration would be done with a representative dataset + input_fp32 = torch.randn(4, 1, 4, 4) + model_fp32_prepared(input_fp32) + + # Convert the observed model to a quantized model. This does several things: + # quantizes the weights, computes and stores the scale and bias value to be + # used with each activation tensor, and replaces key operators with quantized + # implementations. + model_int8 = torch.quantization.convert(model_fp32_prepared) + + # run the model, relevant calculations will happen in int8 + res = model_int8(input_fp32) + +To learn more about static quantization, please see the `static quantization tutorial +`_. + +Quantization Aware Training +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Quantization Aware Training models the effects of quantization during training +allowing for higher accuracy compared to other quantization methods. During +training, all calculations are done in floating point, with fake_quant modules +modeling the effects of quantization by clamping and rounding to simulate the +effects of INT8. After model conversion, weights and +activations are quantized, and activations are fused into the preceding layer +where possible. It is commonly used with CNNs and yields a higher accuracy +compared to static quantization. Quantization Aware Training is also known as +QAT. + +Diagram:: + + # original model + # all tensors and computations are in floating point + previous_layer_fp32 -- linear_fp32 -- activation_fp32 -- next_layer_fp32 + / + linear_weight_fp32 + + # model with fake_quants for modeling quantization numerics during training + previous_layer_fp32 -- fq -- linear_fp32 -- activation_fp32 -- fq -- next_layer_fp32 + / + linear_weight_fp32 -- fq + + # quantized model + # weights and activations are in int8 + previous_layer_int8 -- linear_with_activation_int8 -- next_layer_int8 + / + linear_weight_int8 + +API Example:: + + import torch + + # define a floating point model where some layers could benefit from QAT + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + # QuantStub converts tensors from floating point to quantized + self.quant = torch.quantization.QuantStub() + self.conv = torch.nn.Conv2d(1, 1, 1) + self.bn = torch.nn.BatchNorm2d(1) + self.relu = torch.nn.ReLU() + # DeQuantStub converts tensors from quantized to floating point + self.dequant = torch.quantization.DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + x = self.dequant(x) + return x + + # create a model instance + model_fp32 = M() + + # model must be set to train mode for QAT logic to work + model_fp32.train() + + # attach a global qconfig, which contains information about what kind + # of observers to attach. Use 'fbgemm' for server inference and + # 'qnnpack' for mobile inference. 
Other quantization configurations such + # as selecting symmetric or assymetric quantization and MinMax or L2Norm + # calibration techniques can be specified here. + model_fp32.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm') + + # fuse the activations to preceding layers, where applicable + # this needs to be done manually depending on the model architecture + model_fp32_fused = torch.quantization.fuse_modules(model_fp32, + [['conv', 'bn', 'relu']]) + + # Prepare the model for QAT. This inserts observers and fake_quants in + # the model that will observe weight and activation tensors during calibration. + model_fp32_prepared = torch.quantization.prepare_qat(model_fp32_fused) + + # run the training loop (not shown) + training_loop(model_fp32_prepared) + + # Convert the observed model to a quantized model. This does several things: + # quantizes the weights, computes and stores the scale and bias value to be + # used with each activation tensor, fuses modules where appropriate, + # and replaces key operators with quantized implementations. + model_fp32_prepared.eval() + model_int8 = torch.quantization.convert(model_fp32_prepared) + + # run the model, relevant calculations will happen in int8 + res = model_int8(input_fp32) + +To learn more about quantization aware training, please see the `QAT +tutorial +`_. + Quantized Tensors --------------------------------------- @@ -121,79 +376,8 @@ cover typical CNN and RNN models torch.nn.quantized torch.nn.quantized.dynamic -Quantization Workflows ----------------------- - -PyTorch provides three approaches to quantize models. - -.. _quantization tutorials: - https://pytorch.org/tutorials/#quantization-experimental - -1. Post Training Dynamic Quantization: This is the simplest to apply form of - quantization where the weights are quantized ahead of time but the - activations are dynamically quantized during inference. This is used - for situations where the model execution time is dominated by loading - weights from memory rather than computing the matrix multiplications. - This is true for for LSTM and Transformer type models with small - batch size. Applying dynamic quantization to a whole model can be - done with a single call to :func:`torch.quantization.quantize_dynamic()`. - See the `quantization tutorials`_ -2. Post Training Static Quantization: This is the most commonly used form of - quantization where the weights are quantized ahead of time and the - scale factor and bias for the activation tensors is pre-computed - based on observing the behavior of the model during a calibration - process. Post Training Quantization is typically when both memory bandwidth - and compute savings are important with CNNs being a typical use case. - The general process for doing post training quantization is: - - - - 1. Prepare the model: - - a. Specify where the activations are quantized and dequantized explicitly - by adding QuantStub and DeQuantStub modules. - b. Ensure that modules are not reused. - c. Convert any operations that require requantization into modules - - 2. Fuse operations like conv + relu or conv+batchnorm + relu together to - improve both model accuracy and performance. - - 3. Specify the configuration of the quantization methods \'97 such as - selecting symmetric or asymmetric quantization and MinMax or - L2Norm calibration techniques. - 4. Use the :func:`torch.quantization.prepare` to insert modules - that will observe activation tensors during calibration - 5. 
Calibrate the model by running inference against a calibration - dataset - 6. Finally, convert the model itself with the - torch.quantization.convert() method. This does several things: it - quantizes the weights, computes and stores the scale and bias - value to be used each activation tensor, and replaces key - operators quantized implementations. - - See the `quantization tutorials`_ - - -3. Quantization Aware Training: In the rare cases where post training - quantization does not provide adequate accuracy training can be done - with simulated quantization using the - :class:`torch.quantization.FakeQuantize`. Computations will take place in - FP32 but with values clamped and rounded to simulate the effects of INT8 - quantization. The sequence of steps is very similar. - - - 1. Steps (1) and (2) are identical. - - 3. Specify the configuration of the fake quantization methods \'97 such as - selecting symmetric or asymmetric quantization and MinMax or Moving Average - or L2Norm calibration techniques. - 4. Use the :func:`torch.quantization.prepare_qat` to insert modules - that will simulate quantization during training. - 5. Train or fine tune the model. - 6. Identical to step (6) for post training quantization - - See the `quantization tutorials`_ - +Quantization Customizations +--------------------------- While default implementations of observers to select the scale factor and bias based on observed tensor data are provided, developers can provide their own @@ -218,9 +402,15 @@ prior to quantization. This is because currently quantization works on a module by module basis. Specifically, for all quantization techniques, the user needs to: 1. Convert any operations that require output requantization (and thus have - additional parameters) from functionals to module form. + additional parameters) from functionals to module form (for example, + using ``torch.nn.ReLU`` instead of ``torch.nn.functional.relu``). 2. Specify which parts of the model need to be quantized either by assigning - ```.qconfig`` attributes on submodules or by specifying ``qconfig_dict`` + ``.qconfig`` attributes on submodules or by specifying ``qconfig_dict``. + For example, setting ``model.conv1.qconfig = None`` means that the + ``model.conv`` layer will not be quantized, and setting + ``model.linear1.qconfig = custom_qconfig`` means that the quantization + settings for ``model.linear1`` will be using ``custom_qconfig`` instead + of the global qconfig. For static quantization techniques which quantize activations, the user needs to do the following in addition: @@ -238,6 +428,13 @@ to do the following in addition: to be fused. We currently support the following fusions: [Conv, Relu], [Conv, BatchNorm], [Conv, BatchNorm, Relu], [Linear, Relu] +Best Practices +-------------- + +1. Set the ``reduce_range`` argument on observers to `True` if you are using the + ``fbgemm`` backend. This argument prevents overflow on some int8 instructions + by reducing the range of quantized data type by 1 bit. + Modules that provide quantization functions and classes ------------------------------------------------------- diff --git a/docs/source/rpc.rst b/docs/source/rpc.rst index 37adc14faae1..1d786710d15c 100644 --- a/docs/source/rpc.rst +++ b/docs/source/rpc.rst @@ -113,8 +113,6 @@ and move it to the desired devices on the callee if necessary. The RPC package also provides decorators which allow applications to specify how a given function should be treated on the callee side. -.. 
warning:: - The ``rpc.functions`` package is a prototype feature and subject to change. .. autofunction:: torch.distributed.rpc.functions.async_execution @@ -142,9 +140,6 @@ to configure the backend's behavior. TensorPipe Backend """""""""""""""""" -.. warning:: - The TensorPipe backend is a **beta feature**. - The TensorPipe agent, which is the default, leverages `the TensorPipe library `_, which provides a natively point-to-point communication primitive specifically suited for machine learning @@ -192,6 +187,10 @@ Example:: Process Group Backend """"""""""""""""""""" +.. warning :: + The Process Group Backend will be deprecated soon, we recommend using the + TensorPipe Backend instead. + The Process Group agent instantiates a process group from the :mod:`~torch.distributed` module and utilizes its point-to-point communication capabilities to send RPC messages. Internally, the process @@ -293,8 +292,13 @@ The RRef design note covers the design of the :ref:`rref` (Remote REFerence) pro Tutorials --------- -The RPC tutorial introduces users to the RPC framework and provides two example applications using :ref:`torch.distributed.rpc` APIs. +The RPC tutorials introduce users to the RPC framework, provide several example applications +using :ref:`torch.distributed.rpc` APIs, and demonstrate how +to use `the profiler `__ to profile RPC-based workloads. - `Getting started with Distributed RPC Framework `__ - `Implementing a Parameter Server using Distributed RPC Framework `__ - `Combining Distributed DataParallel with Distributed RPC Framework `__ +- `Profiling RPC-based Workloads `__ +- `Implementing batch RPC processing `__ +- `Distributed Pipeline Parallel `__ diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index d7a94711e76b..94b1fb25f58e 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -8,7 +8,7 @@ torch.Tensor A :class:`torch.Tensor` is a multi-dimensional matrix containing elements of a single data type. -Torch defines 10 tensor types with CPU and GPU variants: +Torch defines 10 tensor types with CPU and GPU variants which are as follows: ========================== =========================================== ============================= ================================ Data type dtype CPU tensor GPU tensor @@ -32,7 +32,7 @@ Boolean ``torch.bool`` :class: Sometimes referred to as binary16: uses 1 sign, 5 exponent, and 10 significand bits. Useful when precision is important at the expense of range. .. [2] - Sometimes referred to as Brain Floating Point: use 1 sign, 8 exponent and 7 + Sometimes referred to as Brain Floating Point: uses 1 sign, 8 exponent, and 7 significand bits. Useful when range is important, since it has the same number of exponent bits as ``float32`` @@ -453,6 +453,8 @@ view of a storage and defines numeric operations on it. .. automethod:: narrow .. automethod:: narrow_copy .. automethod:: ndimension + .. automethod:: nan_to_num + .. automethod:: nan_to_num_ .. automethod:: ne .. automethod:: ne_ .. automethod:: not_equal @@ -532,6 +534,8 @@ view of a storage and defines numeric operations on it. .. automethod:: sign .. automethod:: sign_ .. automethod:: signbit + .. automethod:: sgn + .. automethod:: sgn_ .. automethod:: sin .. automethod:: sin_ .. 
automethod:: sinh diff --git a/docs/source/torch.rst b/docs/source/torch.rst index beab6c449df1..d0537947d4ff 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -312,6 +312,7 @@ Pointwise Ops mul multiply mvlgamma + nan_to_num neg negative nextafter @@ -536,3 +537,4 @@ Utilities set_deterministic is_deterministic vmap + Assert diff --git a/ios/LibTorch.podspec b/ios/LibTorch.podspec index 17e9fb26afa1..f74e2dc9f37e 100644 --- a/ios/LibTorch.podspec +++ b/ios/LibTorch.podspec @@ -1,6 +1,6 @@ Pod::Spec.new do |s| s.name = 'LibTorch' - s.version = '1.6.0' + s.version = '1.6.1' s.authors = 'PyTorch Team' s.license = { :type => 'BSD' } s.homepage = 'https://github.com/pytorch/pytorch' diff --git a/mypy.ini b/mypy.ini index a7d4acea9571..ea7bdb1a83ed 100644 --- a/mypy.ini +++ b/mypy.ini @@ -53,45 +53,24 @@ ignore_errors = True [mypy-torch.distributed.*] ignore_errors = True -[mypy-torch.testing._internal.codegen.*] -ignore_errors = True - -[mypy-torch.testing._internal.autocast_test_lists.*] -ignore_errors = True - [mypy-torch.testing._internal.hypothesis_utils.*] ignore_errors = True -[mypy-torch.testing._internal.common_methods_invocations.*] -ignore_errors = True - [mypy-torch.testing._internal.common_nn.*] ignore_errors = True [mypy-torch.testing._internal.common_quantization.*] ignore_errors = True -[mypy-torch.testing._internal.common_utils.*] -ignore_errors = True - [mypy-torch.testing._internal.generated.*] ignore_errors = True [mypy-torch.testing._internal.distributed.*] ignore_errors = True -[mypy-torch.quantization.observer] -ignore_errors = True - [mypy-torch.quantization.stubs] ignore_errors = True -[mypy-torch.quantization.fake_quantize] -ignore_errors = True - -[mypy-torch.quantization.quantize_jit] -ignore_errors = True - [mypy-torch.quantization._numeric_suite] ignore_errors = True @@ -102,15 +81,9 @@ ignore_errors = True [mypy-torch.quantization.fx.*] ignore_errors = True -[mypy-torch.quasirandom] -ignore_errors = True - [mypy-torch.distributions.*] ignore_errors = True -[mypy-torch.tensor] -ignore_errors = True - [mypy-torch._tensor_str] ignore_errors = True @@ -159,21 +132,6 @@ ignore_errors = True [mypy-torch.nn.parallel.comm] ignore_errors = True -[mypy-torch.nn.quantized.functional] -ignore_errors = True - -[mypy-torch.nn.quantized.modules] -ignore_errors = True - -[mypy-torch.nn.quantized.modules.activation] -ignore_errors = True - -[mypy-torch.nn.quantized.modules.normalization] -ignore_errors = True - -[mypy-torch.nn.quantized.modules.utils] -ignore_errors = True - [mypy-torch.nn.qat.modules.activations] ignore_errors = True @@ -186,21 +144,9 @@ ignore_errors = True [mypy-torch.nn.quantized.modules.conv] ignore_errors = True -[mypy-torch.nn.quantized.modules.functional_modules] -ignore_errors = True - [mypy-torch.cuda] ignore_errors = True -[mypy-torch.cuda.amp.*] -ignore_errors = True - -[mypy-torch.cuda.comm] -ignore_errors = True - -[mypy-torch.cuda.nccl] -ignore_errors = True - [mypy-torch._lobpcg] ignore_errors = True @@ -222,12 +168,6 @@ ignore_errors = True [mypy-torch.contrib._tensorboard_vis] ignore_errors = True -[mypy-torch.utils.data._utils.worker] -ignore_errors = True - -[mypy-torch.utils.data.distributed] -ignore_errors = True - [mypy-torch.nn.utils.prune] ignore_errors = True diff --git a/scripts/get_python_cmake_flags.py b/scripts/get_python_cmake_flags.py index 0fac6d20d4d4..9121c5ebf0db 100644 --- a/scripts/get_python_cmake_flags.py +++ b/scripts/get_python_cmake_flags.py @@ -12,9 +12,9 @@ # make # -from __future__ import 
absolute_import -from __future__ import unicode_literals -from __future__ import print_function + + + from distutils import sysconfig import sys diff --git a/scripts/onnx/test.sh b/scripts/onnx/test.sh index 77f9c8b9f16e..8b6fc6c4cf63 100755 --- a/scripts/onnx/test.sh +++ b/scripts/onnx/test.sh @@ -70,4 +70,6 @@ if [[ "$BUILD_ENVIRONMENT" == *ort_test2* ]]; then pytest "${args[@]}" \ "$top_dir/test/onnx/test_pytorch_onnx_onnxruntime.py::TestONNXRuntime_opset$i" done + pytest "${args[@]}" \ + "$top_dir/test/onnx/test_pytorch_onnx_onnxruntime.py::TestONNXRuntime_opset12_onnx_shape_inference" fi diff --git a/scripts/xcode_build.rb b/scripts/xcode_build.rb index 801ad34a64fd..810c23352fdd 100644 --- a/scripts/xcode_build.rb +++ b/scripts/xcode_build.rb @@ -62,10 +62,13 @@ project.save sdk = nil +arch = nil if options[:platform] == 'SIMULATOR' sdk = 'iphonesimulator' + arch = 'x86_64' elsif options[:platform] == 'OS' sdk = 'iphoneos' + arch = 'arm64' else raise "unsupported platform #{options[:platform]}" end @@ -76,4 +79,5 @@ end # run xcodebuild -exec "xcodebuild clean build -project #{xcodeproj_path} -target #{target.name} -sdk #{sdk} -configuration Release PROVISIONING_PROFILE_SPECIFIER=#{profile}" +exec "xcodebuild clean build -project #{xcodeproj_path} -target #{target.name} -sdk #{sdk} -configuration Release PROVISIONING_PROFILE_SPECIFIER=#{profile} -arch #{arch}" + diff --git a/setup.py b/setup.py index 2a2f911e0d3d..c29ee929b8ca 100644 --- a/setup.py +++ b/setup.py @@ -61,6 +61,9 @@ # BUILD_CAFFE2_OPS=0 # disable Caffe2 operators build # +# BUILD_CAFFE2=0 +# disable Caffe2 build +# # USE_IBVERBS # toggle features related to distributed support # @@ -162,7 +165,7 @@ # When turned on, the following cmake variables will be toggled as well: # USE_SYSTEM_CPUINFO=ON USE_SYSTEM_SLEEF=ON BUILD_CUSTOM_PROTOBUF=OFF -from __future__ import print_function + import sys if sys.version_info < (3,): print("Python 2 has reached end-of-life and is no longer supported by PyTorch.") @@ -340,7 +343,11 @@ def check_file(f): ################################################################################ # the list of runtime dependencies required by this built package -install_requires = ['future', 'typing_extensions', 'dataclasses'] +install_requires = [ + 'future', + 'typing_extensions', + 'dataclasses; python_version < "3.7"' +] missing_pydep = ''' Missing build dependency: Unable to `import {importname}`. @@ -776,6 +783,10 @@ def print_box(msg): 'include/ATen/detail/*.h', 'include/ATen/native/*.h', 'include/ATen/native/cpu/*.h', + 'include/ATen/native/cuda/*.h', + 'include/ATen/native/cuda/*.cuh', + 'include/ATen/native/hip/*.h', + 'include/ATen/native/hip/*.cuh', 'include/ATen/native/quantized/*.h', 'include/ATen/native/quantized/cpu/*.h', 'include/ATen/quantized/*.h', diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index d5cbe5a884a9..a2f843d78f72 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -1,4 +1,3 @@ - import argparse import datetime import re @@ -28,6 +27,8 @@ # NB: function name DOES NOT include overload name! 
allow_list = [ ("c10_experimental", datetime.date(2222, 1, 1)), + # Internal + ("static", datetime.date(9999, 1, 1)), # Internal, profiler-specific ops ("profiler::_call_end_callbacks_on_jit_fut*", datetime.date(9999, 1, 1)), ("profiler::_record_function_enter", datetime.date(9999, 1, 1)), @@ -58,16 +59,16 @@ ("aten::atan2", datetime.date(2020, 7, 30)), ("aten::copy_", datetime.date(2020, 7, 30)), ("aten::sort", datetime.date(2020, 7, 30)), - ('aten::_convolution', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution_transpose', datetime.date(2020, 10, 15)), - ('aten::_convolution_double_backward', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution_backward_input', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution_backward', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution_backward_weight', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution_transpose_backward', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution_transpose_backward_input', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution_transpose_backward_weight', datetime.date(2020, 10, 15)), + ("aten::_convolution", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution_transpose", datetime.date(2020, 10, 15)), + ("aten::_convolution_double_backward", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution_backward_input", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution_backward", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution_backward_weight", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution_transpose_backward", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution_transpose_backward_input", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution_transpose_backward_weight", datetime.date(2020, 10, 15)), ("aten::_cudnn_init_dropout_state", datetime.date(2020, 7, 30)), ("aten::sparse_coo_tensor", datetime.date(2020, 7, 30)), ("aten::_sparse_coo_tensor_with_dims", datetime.date(2020, 7, 30)), @@ -90,6 +91,7 @@ ("aten::logspace", datetime.date(2020, 9, 30)), ("aten::logspace.out", datetime.date(2020, 9, 30)), ("__getstate__", datetime.date(2020, 9, 11), "Conv[23]dPackedParams"), + ("_caffe2::LearningRate", datetime.date(2020, 10, 1)), ("aten::_var", datetime.date(2020, 10, 1)), ("aten::_std", datetime.date(2020, 10, 1)), ("aten::_foreach_add_", datetime.date(2020, 10, 1)), @@ -99,6 +101,16 @@ ("preprocess", datetime.date(2020, 10, 1)), ("compile", datetime.date(2020, 10, 1)), ("execute", datetime.date(2020, 10, 1)), + ("aten::_addr", datetime.date(2020, 10, 31)), + ("aten::_addr_", datetime.date(2020, 10, 31)), + ("aten::_addr.out", datetime.date(2020, 10, 31)), + ("aten::_foreach_add", datetime.date(2020, 10, 1)), + ("aten::_foreach_sub_", datetime.date(2020, 10, 1)), + ("aten::_foreach_div", datetime.date(2020, 10, 1)), + ("aten::_foreach_sub", datetime.date(2020, 10, 1)), + ("aten::_amp_non_finite_check_and_unscale_", datetime.date(9999, 1, 1)), + ("aten::choose_qparams_optimized", datetime.date(2020, 10, 5)), + ("aten::smooth_l1_loss_backward", datetime.date(2020, 10, 15)), ] @@ -115,6 +127,7 @@ def allow_listed(schema, allow_list): return True return False + # The nightly will fail to parse newly added syntax to schema declarations # Add new schemas that will fail the nightly here dont_parse_list = [ @@ -122,6 +135,7 @@ def allow_listed(schema, allow_list): ("test_backend", datetime.date(2099, 9, 17)), ] + def 
dont_parse(schema_line): for item in dont_parse_list: if item[1] < datetime.date.today(): diff --git a/test/cpp/api/functional.cpp b/test/cpp/api/functional.cpp index 4efdb122efc8..707c1bfd7ac0 100644 --- a/test/cpp/api/functional.cpp +++ b/test/cpp/api/functional.cpp @@ -246,6 +246,18 @@ TEST_F(FunctionalTest, SmoothL1LossDefaultOptions) { ASSERT_TRUE(input.sizes() == input.grad().sizes()); } +TEST_F(FunctionalTest, SmoothL1LossBeta) { + auto input = torch::tensor({0.1, 1.5, 10.0}, torch::dtype(torch::kFloat).requires_grad(true)); + auto target = torch::tensor({0., 1., 5.}, torch::kFloat); + auto output = + F::smooth_l1_loss(input, target, /*reduction=*/torch::kMean, /*beta=*/0.5); + auto expected = torch::tensor(1.67, torch::kFloat); + auto s = output.sum(); + s.backward(); + ASSERT_TRUE(output.allclose(expected)); + ASSERT_TRUE(input.sizes() == input.grad().sizes()); +} + TEST_F(FunctionalTest, SmoothL1LossNoReduction) { auto input = torch::tensor({0.1, 1.2, 4.7}, torch::dtype(torch::kFloat).requires_grad(true)); auto target = torch::tensor({0., 1., 5.}, torch::kFloat); @@ -670,6 +682,56 @@ TEST_F(FunctionalTest, TripletMarginLoss) { ASSERT_TRUE(output.allclose(expected, 1e-04)); } +TEST_F(FunctionalTest, TripletMarginWithDistanceLossDefaultParity) { + // Check that if we use torch::pairwise_distance with the default + // TripletMarginLoss options as our distance function, the outputs + // are equal (i.e., equal under defaults). + + std::vector + reductions = {torch::kSum, torch::kMean, torch::kNone}; + std::vector margins = {0.5, 1.0, 1.5}; + std::vector swaps = {true, false}; + + for (auto& reduction : reductions) { + for (auto& margin : margins) { + for (const auto& swap : swaps) { + auto anchor = + torch::randn({100, 128}, torch::dtype(torch::kFloat).requires_grad(true)); + auto positive = + torch::randn({100, 128}, torch::dtype(torch::kFloat).requires_grad(true)); + auto negative = + torch::randn({100, 128}, torch::dtype(torch::kFloat).requires_grad(true)); + + auto basicOptions = F::TripletMarginLossFuncOptions() + .reduction(reduction) + .margin(margin) + .swap(swap); + auto distanceOptions = + F::TripletMarginWithDistanceLossFuncOptions() + .reduction(reduction) + .margin(margin) + .swap(swap); + TripletMarginLoss basicLoss(basicOptions); + TripletMarginWithDistanceLoss distanceLoss(distanceOptions); + + auto basicOutput = + F::triplet_margin_loss(anchor, positive, negative, basicOptions); + auto distanceOutput = F::triplet_margin_with_distance_loss( + anchor, positive, negative, distanceOptions); + + ASSERT_TRUE(distanceOutput.allclose(basicOutput, 1e-6, 1e-6)); + + // handle for torch::kNone reduction + auto sum = distanceOutput.sum(); + sum.backward(); + ASSERT_EQ(anchor.sizes(), anchor.grad().sizes()); + ASSERT_EQ(positive.sizes(), positive.grad().sizes()); + ASSERT_EQ(negative.sizes(), negative.grad().sizes()); + } + } + } +} + TEST_F(FunctionalTest, NLLLoss) { auto input = torch::tensor({{-0.1315, -3.1315, -2.5315}, {-3.7038, -0.1038, -2.6038}, diff --git a/test/cpp/api/modules.cpp b/test/cpp/api/modules.cpp index 4777cf0b54bc..ef0fc2765551 100644 --- a/test/cpp/api/modules.cpp +++ b/test/cpp/api/modules.cpp @@ -2085,6 +2085,115 @@ TEST_F(ModulesTest, TripletMarginLoss) { ASSERT_EQ(anchor.sizes(), anchor.grad().sizes()); } +TEST_F(ModulesTest, TripletMarginWithDistanceLossDefaultParity) { + // Check that if we use torch::pairwise_distance with the default + // TripletMarginLoss options as our distance function, the outputs + // are equal (i.e., equal under defaults). 
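+  // With the default distance function, both losses compute
+  // max(d(a, p) - d(a, n) + margin, 0) (modulo the optional distance swap),
+  // where d is the L2 pairwise distance, so the module and the
+  // TripletMarginLoss baseline should agree for every reduction/margin/swap
+  // combination exercised below.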
+ + std::vector + reductions = {torch::kSum, torch::kMean, torch::kNone}; + std::vector margins = {0.5, 1.0, 1.5}; + std::vector swaps = {true, false}; + + for (auto& reduction : reductions) { + for (auto& margin : margins) { + for (const auto& swap : swaps) { + auto anchor = + torch::randn({100, 128}, torch::dtype(torch::kFloat).requires_grad(true)); + auto positive = + torch::randn({100, 128}, torch::dtype(torch::kFloat).requires_grad(true)); + auto negative = + torch::randn({100, 128}, torch::dtype(torch::kFloat).requires_grad(true)); + + auto basicOptions = TripletMarginLossOptions() + .reduction(reduction) + .margin(margin) + .swap(swap); + auto distanceOptions = + TripletMarginWithDistanceLossOptions() + .reduction(reduction) + .margin(margin) + .swap(swap); + TripletMarginLoss basicLoss(basicOptions); + TripletMarginWithDistanceLoss distanceLoss(distanceOptions); + + auto basicOutput = basicLoss->forward(anchor, positive, negative); + auto distanceOutput = distanceLoss->forward(anchor, positive, negative); + auto basicOperatorOutput = basicLoss(anchor, positive, negative); + auto distanceOperatorOutput = distanceLoss(anchor, positive, negative); + + ASSERT_TRUE(distanceOutput.allclose(basicOutput, 1e-6, 1e-6)); + ASSERT_TRUE(distanceOperatorOutput.allclose(distanceOutput, 1e-6, 1e-6)); + ASSERT_TRUE(distanceOperatorOutput.allclose(basicOperatorOutput, 1e-6, 1e-6)); + + // handle for torch::kNone reduction + auto sum = distanceOutput.sum(); + sum.backward(); + ASSERT_EQ(anchor.sizes(), anchor.grad().sizes()); + ASSERT_EQ(positive.sizes(), positive.grad().sizes()); + ASSERT_EQ(negative.sizes(), negative.grad().sizes()); + } + } + } +} + +TEST_F(ModulesTest, TripletMarginWithDistanceLossFunctionalParity) { + // Check for parity between F::triplet_margin_with_distance_loss and + // TripletMarginWithDistanceLoss. 
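+  // Parity is expected because the module and the functional are configured
+  // with identical options here; each distance function (L2 pairwise distance
+  // and 1 - cosine_similarity) is exercised with every reduction, margin,
+  // and swap combination.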
+ auto pairwise_distance = [&](const torch::Tensor& x, const torch::Tensor& y) { + return torch::pairwise_distance(x, y); + }; + auto cosine_distance = [&](const torch::Tensor& x, + const torch::Tensor& y) { + return 1.0 - torch::cosine_similarity(x, y); + }; + std::vector + distance_functions = {pairwise_distance, cosine_distance}; + + std::vector + reductions = {torch::kSum, torch::kMean, torch::kNone}; + std::vector margins = {0.5, 1.0, 1.5}; + std::vector swaps = {true, false}; + + for (auto& function : distance_functions) { + for (auto& reduction : reductions) { + for (auto& margin : margins) { + for (const auto& swap : swaps) { + auto moduleOptions = + TripletMarginWithDistanceLossOptions() + .distance_function(function) + .reduction(reduction) + .margin(margin) + .swap(swap); + auto functionOptions = + torch::nn::functional::TripletMarginWithDistanceLossFuncOptions() + .distance_function(function) + .reduction(reduction) + .margin(margin) + .swap(swap); + + auto anchor = torch::randn( + {100, 128}, torch::dtype(torch::kFloat).requires_grad(true)); + auto positive = torch::randn( + {100, 128}, torch::dtype(torch::kFloat).requires_grad(true)); + auto negative = torch::randn( + {100, 128}, torch::dtype(torch::kFloat).requires_grad(true)); + + TripletMarginWithDistanceLoss distanceLoss(moduleOptions); + + auto moduleOutput = distanceLoss->forward(anchor, positive, negative); + auto moduleOperatorOutput = distanceLoss(anchor, positive, negative); + auto functionOutput = torch::nn::functional::triplet_margin_with_distance_loss( + anchor, positive, negative, functionOptions); + + ASSERT_TRUE(moduleOutput.allclose(functionOutput, 1e-6, 1e-6)); + ASSERT_TRUE(moduleOperatorOutput.allclose(functionOutput, 1e-6, 1e-6)); + } + } + } + } +} + TEST_F(ModulesTest, NLLLoss) { NLLLoss loss; auto input = torch::tensor({{-0.1315, -3.1315, -2.5315}, @@ -3529,9 +3638,9 @@ TEST_F(ModulesTest, PrettyPrintIdentity) { } TEST_F(ModulesTest, PrettyPrintFlatten) { - ASSERT_EQ(c10::str(Flatten()), + ASSERT_EQ(c10::str(Flatten()), "torch::nn::Flatten(start_dim=1, end_dim=-1)"); - ASSERT_EQ(c10::str(Flatten(FlattenOptions().start_dim(2).end_dim(4))), + ASSERT_EQ(c10::str(Flatten(FlattenOptions().start_dim(2).end_dim(4))), "torch::nn::Flatten(start_dim=2, end_dim=4)"); } @@ -4394,6 +4503,20 @@ TEST_F(ModulesTest, PrettyPrintTripletMarginLoss) { "torch::nn::TripletMarginLoss(margin=3, p=2, eps=1e-06, swap=false)"); } +TEST_F(ModulesTest, PrettyPrintTripletMarginWithDistanceLoss) { + auto distanceOptions = TripletMarginWithDistanceLossOptions() + .distance_function([&](const torch::Tensor& x, + const torch::Tensor& y) { + return torch::pairwise_distance(x, y, 2.0, 1e-6); + }) + .margin(1.5) + .swap(true) + .reduction(torch::kMean); + ASSERT_EQ( + c10::str(TripletMarginWithDistanceLoss(distanceOptions)), + "torch::nn::TripletMarginWithDistanceLoss(margin=1.5, swap=true)"); +} + TEST_F(ModulesTest, PrettyPrintNLLLoss) { ASSERT_EQ( c10::str(NLLLoss()), "torch::nn::NLLLoss()"); diff --git a/test/cpp/dist_autograd/CMakeLists.txt b/test/cpp/dist_autograd/CMakeLists.txt index 5d23602881f0..9969c63e16d5 100644 --- a/test/cpp/dist_autograd/CMakeLists.txt +++ b/test/cpp/dist_autograd/CMakeLists.txt @@ -1,4 +1,4 @@ -if(USE_DISTRIBUTED) +if(USE_DISTRIBUTED AND NOT WIN32) set(DIST_AUTOGRAD_TEST_DIR "${TORCH_ROOT}/test/cpp/dist_autograd") set(DIST_AUTOGRAD_TEST_SOURCES ${TORCH_ROOT}/test/cpp/common/main.cpp diff --git a/test/cpp/jit/CMakeLists.txt b/test/cpp/jit/CMakeLists.txt index 84f7193ad8c0..2e22cd646813 100644 --- 
a/test/cpp/jit/CMakeLists.txt +++ b/test/cpp/jit/CMakeLists.txt @@ -2,7 +2,10 @@ set(JIT_TEST_ROOT ${TORCH_ROOT}/test/cpp/jit) # Build separate libraries the define custom classes/operators used from our Python tests. # These are intended to be used with torch.ops.load_library() in our Python test suite. -add_library(torchbind_test SHARED ${JIT_TEST_ROOT}/test_custom_class.cpp) +add_library(torchbind_test SHARED + ${JIT_TEST_ROOT}/test_custom_class_registrations.h + ${JIT_TEST_ROOT}/test_custom_class_registrations.cpp +) target_link_libraries(torchbind_test torch) add_library(jitbackend_test SHARED ${JIT_TEST_ROOT}/test_backend.cpp) @@ -16,12 +19,9 @@ endif() # Build the cpp gtest binary containing the cpp-only tests. set(JIT_TEST_SRCS - ${JIT_TEST_ROOT}/gtest.cpp ${JIT_TEST_ROOT}/test_alias_analysis.cpp ${JIT_TEST_ROOT}/test_argument_spec.cpp ${JIT_TEST_ROOT}/test_autodiff.cpp - ${JIT_TEST_ROOT}/test_base.cpp - ${JIT_TEST_ROOT}/test_base.h ${JIT_TEST_ROOT}/test_class_import.cpp ${JIT_TEST_ROOT}/test_class_parser.cpp ${JIT_TEST_ROOT}/test_class_type.cpp @@ -30,6 +30,8 @@ set(JIT_TEST_SRCS ${JIT_TEST_ROOT}/test_cleanup_passes.cpp ${JIT_TEST_ROOT}/test_create_autodiff_subgraphs.cpp ${JIT_TEST_ROOT}/test_custom_class.cpp + ${JIT_TEST_ROOT}/test_custom_class_registrations.h + ${JIT_TEST_ROOT}/test_custom_class_registrations.cpp ${JIT_TEST_ROOT}/test_custom_operators.cpp ${JIT_TEST_ROOT}/test_dce.cpp ${JIT_TEST_ROOT}/test_fuser.cpp @@ -95,8 +97,6 @@ elseif(USE_ROCM) ${PYTORCH_HIP_HCC_LIBRARIES} ${TORCH_CUDA_LIBRARIES}) - target_link_libraries(test_jit PRIVATE caffe2_gpu) - target_compile_definitions(test_jit PRIVATE USE_ROCM) endif() diff --git a/test/cpp/jit/README.md b/test/cpp/jit/README.md index a3e92403201f..ef5ea2d910be 100644 --- a/test/cpp/jit/README.md +++ b/test/cpp/jit/README.md @@ -1,69 +1,44 @@ # JIT C++ Tests -## How to add a new test +## Adding a new test First, create a new test file. Test files should have be placed in this directory, with a name that starts with `test_`, like `test_foo.cpp`. -Here is an example test file you can copy-paste. +In general a single test suite + +Add your test file to the `JIT_TEST_SRCS` list in `test/cpp/jit/CMakeLists.txt`. + +A test file may look like: ```cpp -#include +#include -// Tests go in torch::jit -namespace torch { -namespace jit { +using namespace ::torch::jit -// 1. Test cases are void() functions. -// 2. They start with the prefix `test` -void testCaseOne() { - // ... +TEST(FooTest, BarBaz) { + // ... } -void testCaseTwo() { - // ... -} -} +// Append '_CUDA' to the test case name will automatically filter it out if CUDA +// is not compiled. +TEST(FooTest, NeedsAGpu_CUDA) { + // ... } -``` -Then, register your test in `tests.h`: -```cpp -// Add to TH_FORALL_TESTS_CUDA instead for CUDA-requiring tests -#define TH_FORALL_TESTS(_) \ - _(ADFormulas) \ - _(Attributes) \ - ... - _(CaseOne) // note that the `test` prefix is omitted. - _(CaseTwo) -``` - -We glob all the test files together in `CMakeLists.txt` so that you don't -have to edit it every time you add a test. Unfortunately, this means that in -order to get the build to pick up your new test file, you need to re-run -cmake: -``` -python setup.py build --cmake +// Similarly, if only one GPU is detected, tests with `_MultiCUDA` at the end +// will not be run. +TEST(FooTest, NeedsMultipleGpus_MultiCUDA) { + // ... +} ``` -## Why do we have two different test runners? -We have two different ways of running our cpp tests: -1. With `gtest`, from a standalone binary. -2. 
With Python, from `TestJit.test_cpp` and `TestJit.test_cpp_cuda` (in - `test/test_jit.py`) - -We want both because we need to test things from a pure-C++ environment and -with all our various Python patch-points enabled. - -## How do I run the tests? +## Building and running the tests The following commands assume you are in PyTorch root. -1. With `gtest`: - ```bash - # (re)build the test binary - ninja build/bin/test_jit - # run - build/bin/test_jit --gtest_filter='glob_style_filter*' - ``` -2. With Python: - ``` - python test/test_jit.py TestJit.test_cpp TestJit.test_cpp_cuda - ``` +```bash +# ... Build PyTorch from source, e.g. +python setup.py develop +# (re)build just the binary +ninja -C build bin/test_jit +# run tests +build/bin/test_jit --gtest_filter='glob_style_filter*' +``` diff --git a/test/cpp/jit/gtest.cpp b/test/cpp/jit/gtest.cpp deleted file mode 100644 index e0e512be4352..000000000000 --- a/test/cpp/jit/gtest.cpp +++ /dev/null @@ -1,23 +0,0 @@ -#include - -#include - -namespace torch { -namespace jit { - -#define JIT_GTEST(name) \ - TEST(JitTest, name) { \ - test##name(); \ - } -TH_FORALL_TESTS(JIT_GTEST) -#undef JIT_TEST - -#define JIT_GTEST_CUDA(name) \ - TEST(JitTest, name##_CUDA) { \ - test##name(); \ - } -TH_FORALL_TESTS_CUDA(JIT_GTEST_CUDA) -#undef JIT_TEST_CUDA - -} // namespace jit -} // namespace torch diff --git a/test/cpp/jit/test_alias_analysis.cpp b/test/cpp/jit/test_alias_analysis.cpp index e854113a7a87..e700ee540616 100644 --- a/test/cpp/jit/test_alias_analysis.cpp +++ b/test/cpp/jit/test_alias_analysis.cpp @@ -1238,6 +1238,32 @@ TEST(AliasRegistrationTest, PureWithAnnotationsShouldError) { "Tried to register operator foo::rand11(Tensor(a) arg1) -> (Tensor(a)) with aliasing information in the schema but without AliasAnalysisKind::FROM_SCHEMA"); } +TEST(AliasRegistrationTest, AliasMoveAtenListOp) { + auto graph = std::make_shared(); + std::unordered_map vmap; + auto graph_string = R"IR( + graph(): + %x : Tensor = prim::MakeTestTensor() + %8 : int = prim::Constant[value=0]() + %5 : int = prim::Constant[value=1]() + %4 : int = prim::Constant[value=2]() + %y : Tensor[] = prim::ListConstruct(%x) + %6 : Tensor = aten::add_(%x, %4, %5) + %9 : Tensor = aten::cat(%y, %8) + return (%9))IR"; + + torch::jit::parseIR(graph_string, graph.get(), vmap); + AliasDb aliasDb(graph); + + // bc y.1 has a single used in a single non-aliasing aten op, + // x is added to y.1 contained elements instead of wildcard set + EXPECT_TRUE(!aliasDb.mayAlias(vmap["x"], vmap["9"])); + + // write to contained element should prevent move + EXPECT_TRUE(!aliasDb.moveBeforeTopologicallyValid( + vmap["y"]->node(), vmap["9"]->node())); +} + TEST(AliasRegistrationTest, PureWithAnnotationsShouldError2) { auto registry = torch::RegisterOperators().op( "foo::rand12(Tensor(a) arg1) -> Tensor(b)", diff --git a/test/cpp/jit/test_argument_spec.cpp b/test/cpp/jit/test_argument_spec.cpp index 01e27caac05f..bf40761fc468 100644 --- a/test/cpp/jit/test_argument_spec.cpp +++ b/test/cpp/jit/test_argument_spec.cpp @@ -1,3 +1,5 @@ +#include + #include #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/runtime/argument_spec.h" @@ -5,6 +7,8 @@ namespace torch { namespace jit { +namespace { + int device(const autograd::Variable& v) { return v.device().is_cuda() ? 
v.get_device() : -1; } @@ -38,8 +42,9 @@ autograd::Variable var( autograd::Variable undef() { return autograd::Variable(); } +} // namespace -void testCompleteArgumentSpec() { +TEST(ArgumentSpecTest, CompleteArgumentSpec_CUDA) { auto const CF = at::CPU(at::kFloat); auto const CD = at::CPU(at::kDouble); auto const GF = at::CUDA(at::kFloat); @@ -94,34 +99,35 @@ void testCompleteArgumentSpec() { ASSERT_EQ(with_const.at(2).sizes().size(), 2); } -size_t hashCode(const TensorTypePtr& ptr) { - return std::hash()(*ptr.get()); -} +// TODO: this test was disabled for unknown reasons and doesn't run. +// static size_t hashCode(const TensorTypePtr& ptr) { +// return std::hash()(*ptr.get()); +// } -void testProfiledTensorTypeHashing() { - c10::VaryingShape vs(c10::optional{}); - auto ptt_empty1 = TensorType::create({}, {}, vs, vs, false); - auto ptt_empty2 = TensorType::create({}, {}, vs, vs, false); - ASSERT_EQ(hashCode(ptt_empty1), hashCode(ptt_empty2)); +// TEST(ArgumentSpecTest, VaryingShape) { +// c10::VaryingShape vs(c10::optional{}); +// auto ptt_empty1 = TensorType::create({}, {}, vs, vs, false); +// auto ptt_empty2 = TensorType::create({}, {}, vs, vs, false); +// ASSERT_EQ(hashCode(ptt_empty1), hashCode(ptt_empty2)); - c10::VaryingShape vs22(std::vector{2, 2}); - auto ptt_vs22_vs22_1 = TensorType::create({}, {}, vs22, vs22, false); - auto ptt_vs22_vs22_2 = TensorType::create({}, {}, vs22, vs22, false); - ASSERT_EQ(hashCode(ptt_vs22_vs22_1), hashCode(ptt_vs22_vs22_2)); +// c10::VaryingShape vs22(std::vector{2, 2}); +// auto ptt_vs22_vs22_1 = TensorType::create({}, {}, vs22, vs22, false); +// auto ptt_vs22_vs22_2 = TensorType::create({}, {}, vs22, vs22, false); +// ASSERT_EQ(hashCode(ptt_vs22_vs22_1), hashCode(ptt_vs22_vs22_2)); - c10::VaryingShape vs23(std::vector{2, 3}); - auto ptt_vs22_vs23_2 = TensorType::create({}, {}, vs22, vs23, false); - ASSERT_NE(hashCode(ptt_vs22_vs22_1), hashCode(ptt_vs22_vs23_2)); +// c10::VaryingShape vs23(std::vector{2, 3}); +// auto ptt_vs22_vs23_2 = TensorType::create({}, {}, vs22, vs23, false); +// ASSERT_NE(hashCode(ptt_vs22_vs22_1), hashCode(ptt_vs22_vs23_2)); - auto ptt_vs22_vs22_1_true = TensorType::create({}, {}, vs22, vs22, true); - auto ptt_vs22_vs22_2_true = TensorType::create({}, {}, vs22, vs22, true); - ASSERT_EQ(hashCode(ptt_vs22_vs22_1_true), hashCode(ptt_vs22_vs22_2_true)); +// auto ptt_vs22_vs22_1_true = TensorType::create({}, {}, vs22, vs22, true); +// auto ptt_vs22_vs22_2_true = TensorType::create({}, {}, vs22, vs22, true); +// ASSERT_EQ(hashCode(ptt_vs22_vs22_1_true), hashCode(ptt_vs22_vs22_2_true)); - auto ptt_vs22_vs22_1_false = TensorType::create({}, {}, vs22, vs22, false); - ASSERT_NE(hashCode(ptt_vs22_vs22_1_true), hashCode(ptt_vs22_vs22_1_false)); -} +// auto ptt_vs22_vs22_1_false = TensorType::create({}, {}, vs22, vs22, false); +// ASSERT_NE(hashCode(ptt_vs22_vs22_1_true), hashCode(ptt_vs22_vs22_1_false)); +// } -void testArgumentSpec() { +TEST(ArgumentSpecTest, Basic_CUDA) { auto& CF = at::CPU(at::kFloat); auto& CD = at::CPU(at::kDouble); auto& GF = at::CUDA(at::kFloat); diff --git a/test/cpp/jit/test_autodiff.cpp b/test/cpp/jit/test_autodiff.cpp index 7d431776a971..3993c63b1708 100644 --- a/test/cpp/jit/test_autodiff.cpp +++ b/test/cpp/jit/test_autodiff.cpp @@ -1,4 +1,5 @@ -#include "test/cpp/jit/test_base.h" +#include + #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/frontend/tracer.h" #include "torch/csrc/jit/passes/common_subexpression_elimination.h" @@ -83,7 +84,7 @@ variable_list grad( fmap(inputs, get_edge)); } -void 
testADFormulas() { +TEST(AutodiffTest, ADFormulas) { const auto cast = [](const Variable& v) { return static_cast(v); }; @@ -174,7 +175,7 @@ void testADFormulas() { } } -void testDifferentiate() { +TEST(AutodiffTest, Differentiate) { // Note: can't use IRParser for this test due to issue #23989 auto graph = std::make_shared(); std::vector sizes{2, 3, 4}; @@ -229,7 +230,7 @@ void testDifferentiate() { ->run(*grad_spec.df); } -void testDifferentiateWithRequiresGrad() { +TEST(AutodiffTest, DifferentiateWithRequiresGrad) { const auto graph_string = R"IR( graph(%0 : Tensor, %1 : Tensor): diff --git a/test/cpp/jit/test_base.cpp b/test/cpp/jit/test_base.cpp deleted file mode 100644 index 338577fbd833..000000000000 --- a/test/cpp/jit/test_base.cpp +++ /dev/null @@ -1,26 +0,0 @@ -#include -#include - -#include "torch/csrc/jit/runtime/custom_operator.h" - -namespace torch { -namespace jit { -inline c10::AliasAnalysisKind aliasAnalysisFromSchema() { - return c10::AliasAnalysisKind::FROM_SCHEMA; -} - -namespace { -RegisterOperators reg({ - // This operator is intended to be used in JIT analysis and transformation - // pass unit tests in which Values with type Tensor are often required. It - // should not be used in situations in which the graph is actually executed - // because it always produces empty Tensors. - Operator( - "prim::MakeTestTensor() -> Tensor", - [](Stack* stack) { push(stack, at::Tensor()); }, - aliasAnalysisFromSchema()), -}); -} - -} // namespace jit -} // namespace torch diff --git a/test/cpp/jit/test_base.h b/test/cpp/jit/test_base.h deleted file mode 100644 index 54a59e445e95..000000000000 --- a/test/cpp/jit/test_base.h +++ /dev/null @@ -1,42 +0,0 @@ -#pragma once - -// This file defines assertion macros that work in both gtest and non-gtest -// builds, and has some common includes. 
-#include "torch/csrc/jit/ir/ir.h" -#include "torch/csrc/jit/runtime/operator.h" - -#if defined(USE_GTEST) -#include -#include -#else -#include "c10/util/Exception.h" -#define ASSERT_EQ(x, y) TORCH_INTERNAL_ASSERT((x) == (y)) -#define ASSERT_NE(x, y) TORCH_INTERNAL_ASSERT((x) != (y)) -#define ASSERT_TRUE TORCH_INTERNAL_ASSERT -#define ASSERT_FALSE(x) ASSERT_TRUE(!(x)) -#define ASSERT_THROWS_WITH(statement, substring) \ - try { \ - (void)statement; \ - ASSERT_TRUE(false); \ - } catch (const std::exception& e) { \ - ASSERT_NE(std::string(e.what()).find(substring), std::string::npos); \ - } -#define ASSERT_ANY_THROW(statement) \ - { \ - bool threw = false; \ - try { \ - (void)statement; \ - } catch (const std::exception& e) { \ - threw = true; \ - } \ - ASSERT_TRUE(threw); \ - } - -#endif // defined(USE_GTEST) - -static inline bool isSandcastle() { - return ( - (std::getenv("SANDCASTLE")) || - (std::getenv("TW_JOB_USER") && - std::string(std::getenv("TW_JOB_USER")) == "sandcastle")); -} diff --git a/test/cpp/jit/test_class_import.cpp b/test/cpp/jit/test_class_import.cpp index 82bc0cf3bccc..ffa845b3e2a8 100644 --- a/test/cpp/jit/test_class_import.cpp +++ b/test/cpp/jit/test_class_import.cpp @@ -1,7 +1,7 @@ -#include -#include +#include #include +#include #include #include #include @@ -45,7 +45,7 @@ static void import_libs( si.loadType(QualifiedName(class_name)); } -void testClassImport() { +TEST(ClassImportTest, Basic) { auto cu1 = std::make_shared(); auto cu2 = std::make_shared(); std::vector constantTable; @@ -80,7 +80,7 @@ void testClassImport() { ASSERT_FALSE(c); } -void testScriptObject() { +TEST(ClassImportTest, ScriptObject) { Module m1("m1"); Module m2("m2"); std::vector constantTable; @@ -114,7 +114,7 @@ def __init__(self, x): return x )JIT"; -void testClassDerive() { +TEST(ClassImportTest, ClassDerive) { auto cu = std::make_shared(); auto cls = ClassType::create("foo.bar", cu); const auto self = SimpleSelf(cls); @@ -142,7 +142,7 @@ class FooBar1234(Module): return (self.f).top() )JIT"; -void testSaveLoadTorchbind() { +TEST(ClassImportTest, CustomClass) { auto cu1 = std::make_shared(); std::vector constantTable; // Import different versions of FooTest into two namespaces. 
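The class-import and custom-class tests above operate on TorchBind classes like the
ones registered in test_custom_class_registrations.cpp later in this patch (for
example ``_TorchScriptTesting._StackString``). As a rough sketch, not part of this
patch, such a class is typically exercised from Python as follows, assuming the
``torchbind_test`` shared library has been built (the library path below is
illustrative):

```python
import torch

# Load the shared library containing the TORCH_LIBRARY registration block
# (illustrative path; adjust to the local build tree).
torch.classes.load_library("build/lib/libtorchbind_test.so")

# _StackString is registered under the _TorchScriptTesting namespace with
# push/pop/top methods and def_pickle hooks.
s = torch.classes._TorchScriptTesting._StackString(["hi", "mom"])
s.push("foo")
assert s.pop() == "foo"
assert s.top() == "mom"

# The same object can be used from TorchScript and round-tripped through
# torch.jit.save / torch.jit.load via its pickle hooks.
@torch.jit.script
def top_of(stack: torch.classes._TorchScriptTesting._StackString) -> str:
    return stack.top()

assert top_of(s) == "mom"
```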
diff --git a/test/cpp/jit/test_class_parser.cpp b/test/cpp/jit/test_class_parser.cpp index 45e37103bb5a..2f7f06d3802b 100644 --- a/test/cpp/jit/test_class_parser.cpp +++ b/test/cpp/jit/test_class_parser.cpp @@ -1,4 +1,5 @@ -#include +#include + #include #include @@ -15,7 +16,7 @@ const auto testSource = R"JIT( an_attribute : Tensor )JIT"; -void testClassParser() { +TEST(ClassParserTest, Basic) { Parser p(std::make_shared(testSource)); std::vector definitions; std::vector resolvers; diff --git a/test/cpp/jit/test_class_type.cpp b/test/cpp/jit/test_class_type.cpp index c00aafcc526b..21229594d56d 100644 --- a/test/cpp/jit/test_class_type.cpp +++ b/test/cpp/jit/test_class_type.cpp @@ -1,11 +1,12 @@ -#include +#include + #include #include namespace torch { namespace jit { -void testClassTypeAddRemoveAttr() { +TEST(ClassTypeTest, AddRemoveAttr) { auto cu = std::make_shared(); auto cls = ClassType::create("foo.bar", cu, true); cls->addAttribute("attr1", TensorType::get(), true); @@ -32,12 +33,12 @@ void testClassTypeAddRemoveAttr() { cls->addAttribute("attr1", IntType::get()); } -void testClassTypeAddRemoveConstant() { +TEST(ClassTypeTest, AddRemoveConstant) { auto cu = std::make_shared(); auto cls = ClassType::create("foo.bar", cu); cls->addConstant("const1", IValue(1)); cls->addConstant("const2", IValue(2)); - cls->addConstant("const3", IValue(2)); + cls->addConstant("const3", IValue(3)); ASSERT_EQ(cls->numConstants(), 3); ASSERT_TRUE(cls->hasConstant("const1")); ASSERT_TRUE(cls->hasConstant("const2")); @@ -46,7 +47,7 @@ void testClassTypeAddRemoveConstant() { ASSERT_EQ(cls->getConstant("const1").toInt(), 1); ASSERT_EQ(cls->getConstant("const2").toInt(), 2); - ASSERT_EQ(cls->getConstant("const2").toInt(), 3); + ASSERT_EQ(cls->getConstant("const3").toInt(), 3); cls->unsafeRemoveConstant("const2"); ASSERT_TRUE(cls->hasConstant("const1")); diff --git a/test/cpp/jit/test_cleanup_passes.cpp b/test/cpp/jit/test_cleanup_passes.cpp index 2f2ca4e0a19b..38ceef932eb0 100644 --- a/test/cpp/jit/test_cleanup_passes.cpp +++ b/test/cpp/jit/test_cleanup_passes.cpp @@ -1,19 +1,19 @@ +#include + #include #include #include #include -#include "test/cpp/jit/test_base.h" namespace torch { namespace jit { -void testCleanUpPasses() { +TEST(CleanupPassTest, Basic) { // Tests stability of clean up passes when dealing with constant pooling // and constant propagation. 
- { - auto graph = std::make_shared(); - parseIR( - R"IR( + auto graph = std::make_shared(); + parseIR( + R"IR( graph(%cond.1 : Tensor, %suffix.1 : str): %3 : bool = aten::Bool(%cond.1) # o.py:6:7 @@ -31,20 +31,19 @@ graph(%cond.1 : Tensor, -> (%12) return (%25) )IR", - &*graph); - runCleanupPasses(graph); - testing::FileCheck() - .check_count( - "prim::Constant[value=\"same string with a twist\"]", - 1, - /*exactly=*/true) - ->run(*graph); + &*graph); + runCleanupPasses(graph); + testing::FileCheck() + .check_count( + "prim::Constant[value=\"same string with a twist\"]", + 1, + /*exactly=*/true) + ->run(*graph); - auto graph_after_pass_once = graph->toString(); - runCleanupPasses(graph); - auto graph_after_pass_twice = graph->toString(); - ASSERT_EQ(graph_after_pass_once, graph_after_pass_twice); - } + auto graph_after_pass_once = graph->toString(); + runCleanupPasses(graph); + auto graph_after_pass_twice = graph->toString(); + ASSERT_EQ(graph_after_pass_once, graph_after_pass_twice); } } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_code_template.cpp b/test/cpp/jit/test_code_template.cpp index e4d7d1ef856e..bf539e3d169f 100644 --- a/test/cpp/jit/test_code_template.cpp +++ b/test/cpp/jit/test_code_template.cpp @@ -1,6 +1,6 @@ -#include "test/cpp/jit/test_base.h" -#include "test/cpp/jit/test_utils.h" +#include +#include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/frontend/code_template.h" namespace torch { @@ -33,31 +33,29 @@ static const auto ct_expect = R"( int notest(int a) )"; -void testCodeTemplate() { - { - TemplateEnv e; - e.s("hi", "foo"); - e.v("what", {"is", "this"}); - TemplateEnv c(e); - c.s("hi", "foo2"); - ASSERT_EQ(e.s("hi"), "foo"); - ASSERT_EQ(c.s("hi"), "foo2"); - ASSERT_EQ(e.v("what")[0], "is"); - } +TEST(TestCodeTemplate, Copying) { + TemplateEnv e; + e.s("hi", "foo"); + e.v("what", {"is", "this"}); + TemplateEnv c(e); + c.s("hi", "foo2"); + ASSERT_EQ(e.s("hi"), "foo"); + ASSERT_EQ(c.s("hi"), "foo2"); + ASSERT_EQ(e.v("what")[0], "is"); +} - { - TemplateEnv e; - e.v("args", {"hi", "8"}); - e.v("bar", {"what\non many\nlines...", "7"}); - e.s("a", "3"); - e.s("b", "4"); - e.v("stuff", {"things...", "others"}); - e.v("empty", {}); - auto s = ct.format(e); - // std::cout << "'" << s << "'\n"; - // std::cout << "'" << ct_expect << "'\n"; - ASSERT_EQ(s, ct_expect); - } +TEST(TestCodeTemplate, Formatting) { + TemplateEnv e; + e.v("args", {"hi", "8"}); + e.v("bar", {"what\non many\nlines...", "7"}); + e.s("a", "3"); + e.s("b", "4"); + e.v("stuff", {"things...", "others"}); + e.v("empty", {}); + auto s = ct.format(e); + // std::cout << "'" << s << "'\n"; + // std::cout << "'" << ct_expect << "'\n"; + ASSERT_EQ(s, ct_expect); } } // namespace jit diff --git a/test/cpp/jit/test_constant_pooling.cpp b/test/cpp/jit/test_constant_pooling.cpp index b949c9a45b25..c8cb58e1886a 100644 --- a/test/cpp/jit/test_constant_pooling.cpp +++ b/test/cpp/jit/test_constant_pooling.cpp @@ -1,9 +1,10 @@ +#include + #include #include #include #include #include -#include "test/cpp/jit/test_base.h" #include #include @@ -11,26 +12,26 @@ namespace torch { namespace jit { -void testConstantPooling() { - { - auto graph = std::make_shared(); - parseIR( - R"IR( +TEST(ConstantPoolingTest, Int) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(): %8 : int = prim::Constant[value=1]() %10 : int = prim::Constant[value=1]() return (%8, %10) )IR", - &*graph); - ConstantPooling(graph); - testing::FileCheck() - .check_count("prim::Constant", 1, /*exactly*/ true) - 
->run(*graph); - } - { - auto graph = std::make_shared(); - parseIR( - R"IR( + &*graph); + ConstantPooling(graph); + testing::FileCheck() + .check_count("prim::Constant", 1, /*exactly*/ true) + ->run(*graph); +} + +TEST(ConstantPoolingTest, PoolingAcrossBlocks) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(%cond : Tensor): %a : str = prim::Constant[value="bcd"]() %3 : bool = aten::Bool(%cond) @@ -44,17 +45,18 @@ graph(%cond : Tensor): %7 : (str, str) = prim::TupleConstruct(%a, %b) return (%7) )IR", - &*graph); - ConstantPooling(graph); - testing::FileCheck() - .check_count("prim::Constant[value=\"abc\"]", 1, /*exactly*/ true) - ->check_count("prim::Constant[value=\"bcd\"]", 1, /*exactly*/ true) - ->run(*graph); - } - { - auto graph = std::make_shared(); - parseIR( - R"IR( + &*graph); + ConstantPooling(graph); + testing::FileCheck() + .check_count("prim::Constant[value=\"abc\"]", 1, /*exactly*/ true) + ->check_count("prim::Constant[value=\"bcd\"]", 1, /*exactly*/ true) + ->run(*graph); +} + +TEST(ConstantPoolingTest, PoolingDifferentDevices) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(): %2 : int = prim::Constant[value=2]() %1 : int = prim::Constant[value=1]() @@ -70,22 +72,21 @@ graph(): prim::Print(%x, %y, %z) return (%1) )IR", - &*graph); - // three tensors created - two different devices among the three - // don't have good support for parsing tensor constants - ConstantPropagation(graph); - ConstantPooling(graph); - testing::FileCheck() - .check_count( - "Float(2:1, requires_grad=0, device=cpu) = prim::Constant", - 1, - /*exactly*/ true) - ->check_count( - "Long(2:1, requires_grad=0, device=cpu) = prim::Constant", - 1, - /*exactly*/ true) - ->run(*graph); - } + &*graph); + // three tensors created - two different devices among the three + // don't have good support for parsing tensor constants + ConstantPropagation(graph); + ConstantPooling(graph); + testing::FileCheck() + .check_count( + "Float(2:1, requires_grad=0, device=cpu) = prim::Constant", + 1, + /*exactly*/ true) + ->check_count( + "Long(2:1, requires_grad=0, device=cpu) = prim::Constant", + 1, + /*exactly*/ true) + ->run(*graph); } } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_create_autodiff_subgraphs.cpp b/test/cpp/jit/test_create_autodiff_subgraphs.cpp index 8da6d9d6a1b2..e97043f84d24 100644 --- a/test/cpp/jit/test_create_autodiff_subgraphs.cpp +++ b/test/cpp/jit/test_create_autodiff_subgraphs.cpp @@ -1,4 +1,5 @@ -#include "test/cpp/jit/test_base.h" +#include + #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/passes/create_autodiff_subgraphs.h" @@ -6,7 +7,7 @@ namespace torch { namespace jit { -void testCreateAutodiffSubgraphs() { +TEST(CreateAutodiffSubgraphsTest, Basic) { auto graph = build_lstm(); CreateAutodiffSubgraphs(graph, /*threshold=*/2); // all of the ops are within the DifferentiableGraph diff --git a/test/cpp/jit/test_custom_class.cpp b/test/cpp/jit/test_custom_class.cpp index 543fbc20eb3d..a96a3b4a5635 100644 --- a/test/cpp/jit/test_custom_class.cpp +++ b/test/cpp/jit/test_custom_class.cpp @@ -1,3 +1,6 @@ +#include + +#include #include #include @@ -8,317 +11,7 @@ namespace torch { namespace jit { -namespace { - -struct Foo : torch::CustomClassHolder { - int x, y; - Foo() : x(0), y(0) {} - Foo(int x_, int y_) : x(x_), y(y_) {} - int64_t info() { - return this->x * this->y; - } - int64_t add(int64_t z) { - return (x + y) * z; - } - void increment(int64_t z) { - this->x += z; - this->y += z; - } - int64_t combine(c10::intrusive_ptr b) { - 
return this->info() + b->info(); - } - ~Foo() { - // std::cout<<"Destroying object with values: "< -struct MyStackClass : torch::CustomClassHolder { - std::vector stack_; - MyStackClass(std::vector init) : stack_(init.begin(), init.end()) {} - - void push(T x) { - stack_.push_back(x); - } - T pop() { - auto val = stack_.back(); - stack_.pop_back(); - return val; - } - - c10::intrusive_ptr clone() const { - return c10::make_intrusive(stack_); - } - - void merge(const c10::intrusive_ptr& c) { - for (auto& elem : c->stack_) { - push(elem); - } - } - - std::tuple return_a_tuple() const { - return std::make_tuple(1337.0f, 123); - } -}; - -struct PickleTester : torch::CustomClassHolder { - PickleTester(std::vector vals) : vals(std::move(vals)) {} - std::vector vals; -}; - -at::Tensor take_an_instance(const c10::intrusive_ptr& instance) { - return torch::zeros({instance->vals.back(), 4}); -} - -struct ElementwiseInterpreter : torch::CustomClassHolder { - using InstructionType = std::tuple< - std::string /*op*/, - std::vector /*inputs*/, - std::string /*output*/>; - - ElementwiseInterpreter() {} - - // Load a list of instructions into the interpreter. As specified above, - // instructions specify the operation (currently support "add" and "mul"), - // the names of the input values, and the name of the single output value - // from this instruction - void setInstructions(std::vector instructions) { - instructions_ = std::move(instructions); - } - - // Add a constant. The interpreter maintains a set of constants across - // calls. They are keyed by name, and constants can be referenced in - // Instructions by the name specified - void addConstant(const std::string& name, at::Tensor value) { - constants_.insert_or_assign(name, std::move(value)); - } - - // Set the string names for the positional inputs to the function this - // interpreter represents. When invoked, the interpreter will assign - // the positional inputs to the names in the corresponding position in - // input_names. - void setInputNames(std::vector input_names) { - input_names_ = std::move(input_names); - } - - // Specify the output name for the function this interpreter represents. This - // should match the "output" field of one of the instructions in the - // instruction list, typically the last instruction. - void setOutputName(std::string output_name) { - output_name_ = std::move(output_name); - } - - // Invoke this interpreter. This takes a list of positional inputs and returns - // a single output. Currently, inputs and outputs must all be Tensors. - at::Tensor __call__(std::vector inputs) { - // Environment to hold local variables - std::unordered_map environment; - - // Load inputs according to the specified names - if (inputs.size() != input_names_.size()) { - std::stringstream err; - err << "Expected " << input_names_.size() << " inputs, but got " - << inputs.size() << "!"; - throw std::runtime_error(err.str()); - } - for (size_t i = 0; i < inputs.size(); ++i) { - environment[input_names_[i]] = inputs[i]; - } - - for (InstructionType& instr : instructions_) { - // Retrieve all input values for this op - std::vector inputs; - for (const auto& input_name : std::get<1>(instr)) { - // Operator output values shadow constants. - // Imagine all constants are defined in statements at the beginning - // of a function (a la K&R C). Any definition of an output value must - // necessarily come after constant definition in textual order. 
Thus, - // We look up values in the environment first then the constant table - // second to implement this shadowing behavior - if (environment.find(input_name) != environment.end()) { - inputs.push_back(environment.at(input_name)); - } else if (constants_.find(input_name) != constants_.end()) { - inputs.push_back(constants_.at(input_name)); - } else { - std::stringstream err; - err << "Instruction referenced unknown value " << input_name << "!"; - throw std::runtime_error(err.str()); - } - } - - // Run the specified operation - at::Tensor result; - const auto& op = std::get<0>(instr); - if (op == "add") { - if (inputs.size() != 2) { - throw std::runtime_error("Unexpected number of inputs for add op!"); - } - result = inputs[0] + inputs[1]; - } else if (op == "mul") { - if (inputs.size() != 2) { - throw std::runtime_error("Unexpected number of inputs for mul op!"); - } - result = inputs[0] * inputs[1]; - } else { - std::stringstream err; - err << "Unknown operator " << op << "!"; - throw std::runtime_error(err.str()); - } - - // Write back result into environment - const auto& output_name = std::get<2>(instr); - environment[output_name] = std::move(result); - } - - if (!output_name_) { - throw std::runtime_error("Output name not specififed!"); - } - - return environment.at(*output_name_); - } - - // Ser/De infrastructure. See - // https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html#defining-serialization-deserialization-methods-for-custom-c-classes - // for more info. - - // This is the type we will use to marshall information on disk during - // ser/de. It is a simple tuple composed of primitive types and simple - // collection types like vector, optional, and dict. - using SerializationType = std::tuple< - std::vector /*input_names_*/, - c10::optional /*output_name_*/, - c10::Dict /*constants_*/, - std::vector /*instructions_*/ - >; - - // This function yields the SerializationType instance for `this`. - SerializationType __getstate__() const { - return SerializationType{ - input_names_, output_name_, constants_, instructions_}; - } - - // This function will create an instance of `ElementwiseInterpreter` given - // an instance of `SerializationType`. 
- static c10::intrusive_ptr __setstate__( - SerializationType state) { - auto instance = c10::make_intrusive(); - std::tie( - instance->input_names_, - instance->output_name_, - instance->constants_, - instance->instructions_) = std::move(state); - return instance; - } - - // Class members - std::vector input_names_; - c10::optional output_name_; - c10::Dict constants_; - std::vector instructions_; -}; - -TORCH_LIBRARY(_TorchScriptTesting, m) { - m.class_("_Foo") - .def(torch::init()) - // .def(torch::init<>()) - .def("info", &Foo::info) - .def("increment", &Foo::increment) - .def("add", &Foo::add) - .def("combine", &Foo::combine); - - m.class_("_NoInit").def( - "get_x", [](const c10::intrusive_ptr& self) { return self->x; }); - - m.class_>("_StackString") - .def(torch::init>()) - .def("push", &MyStackClass::push) - .def("pop", &MyStackClass::pop) - .def("clone", &MyStackClass::clone) - .def("merge", &MyStackClass::merge) - .def_pickle( - [](const c10::intrusive_ptr>& self) { - return self->stack_; - }, - [](std::vector state) { // __setstate__ - return c10::make_intrusive>( - std::vector{"i", "was", "deserialized"}); - }) - .def("return_a_tuple", &MyStackClass::return_a_tuple) - .def( - "top", - [](const c10::intrusive_ptr>& self) - -> std::string { return self->stack_.back(); }) - .def( - "__str__", - [](const c10::intrusive_ptr>& self) { - std::stringstream ss; - ss << "["; - for (size_t i = 0; i < self->stack_.size(); ++i) { - ss << self->stack_[i]; - if (i != self->stack_.size() - 1) { - ss << ", "; - } - } - ss << "]"; - return ss.str(); - }); - // clang-format off - // The following will fail with a static assert telling you you have to - // take an intrusive_ptr as the first argument. - // .def("foo", [](int64_t a) -> int64_t{ return 3;}); - // clang-format on - - m.class_("_PickleTester") - .def(torch::init>()) - .def_pickle( - [](c10::intrusive_ptr self) { // __getstate__ - return std::vector{1, 3, 3, 7}; - }, - [](std::vector state) { // __setstate__ - return c10::make_intrusive(std::move(state)); - }) - .def( - "top", - [](const c10::intrusive_ptr& self) { - return self->vals.back(); - }) - .def("pop", [](const c10::intrusive_ptr& self) { - auto val = self->vals.back(); - self->vals.pop_back(); - return val; - }); - - m.def( - "take_an_instance(__torch__.torch.classes._TorchScriptTesting._PickleTester x) -> Tensor Y", - take_an_instance); - // test that schema inference is ok too - m.def("take_an_instance_inferred", take_an_instance); - - m.class_("_ElementwiseInterpreter") - .def(torch::init<>()) - .def("set_instructions", &ElementwiseInterpreter::setInstructions) - .def("add_constant", &ElementwiseInterpreter::addConstant) - .def("set_input_names", &ElementwiseInterpreter::setInputNames) - .def("set_output_name", &ElementwiseInterpreter::setOutputName) - .def("__call__", &ElementwiseInterpreter::__call__) - .def_pickle( - /* __getstate__ */ - [](const c10::intrusive_ptr& self) { - return self->__getstate__(); - }, - /* __setstate__ */ - [](ElementwiseInterpreter::SerializationType state) { - return ElementwiseInterpreter::__setstate__(std::move(state)); - }); -} - -} // namespace - -void testTorchbindIValueAPI() { +TEST(CustomClassTest, TorchbindIValueAPI) { script::Module m("m"); // test make_custom_class API diff --git a/test/cpp/jit/test_custom_class_registrations.cpp b/test/cpp/jit/test_custom_class_registrations.cpp new file mode 100644 index 000000000000..f563120bbc6c --- /dev/null +++ b/test/cpp/jit/test_custom_class_registrations.cpp @@ -0,0 +1,291 @@ +#include + 
+#include +#include + +#include +#include +#include + +using namespace torch::jit; + +namespace { + +struct Foo : torch::CustomClassHolder { + int x, y; + Foo() : x(0), y(0) {} + Foo(int x_, int y_) : x(x_), y(y_) {} + int64_t info() { + return this->x * this->y; + } + int64_t add(int64_t z) { + return (x + y) * z; + } + void increment(int64_t z) { + this->x += z; + this->y += z; + } + int64_t combine(c10::intrusive_ptr b) { + return this->info() + b->info(); + } + ~Foo() { + // std::cout<<"Destroying object with values: "< vals) : vals(std::move(vals)) {} + std::vector vals; +}; + +at::Tensor take_an_instance(const c10::intrusive_ptr& instance) { + return torch::zeros({instance->vals.back(), 4}); +} + +struct ElementwiseInterpreter : torch::CustomClassHolder { + using InstructionType = std::tuple< + std::string /*op*/, + std::vector /*inputs*/, + std::string /*output*/>; + + ElementwiseInterpreter() {} + + // Load a list of instructions into the interpreter. As specified above, + // instructions specify the operation (currently support "add" and "mul"), + // the names of the input values, and the name of the single output value + // from this instruction + void setInstructions(std::vector instructions) { + instructions_ = std::move(instructions); + } + + // Add a constant. The interpreter maintains a set of constants across + // calls. They are keyed by name, and constants can be referenced in + // Instructions by the name specified + void addConstant(const std::string& name, at::Tensor value) { + constants_.insert_or_assign(name, std::move(value)); + } + + // Set the string names for the positional inputs to the function this + // interpreter represents. When invoked, the interpreter will assign + // the positional inputs to the names in the corresponding position in + // input_names. + void setInputNames(std::vector input_names) { + input_names_ = std::move(input_names); + } + + // Specify the output name for the function this interpreter represents. This + // should match the "output" field of one of the instructions in the + // instruction list, typically the last instruction. + void setOutputName(std::string output_name) { + output_name_ = std::move(output_name); + } + + // Invoke this interpreter. This takes a list of positional inputs and returns + // a single output. Currently, inputs and outputs must all be Tensors. + at::Tensor __call__(std::vector inputs) { + // Environment to hold local variables + std::unordered_map environment; + + // Load inputs according to the specified names + if (inputs.size() != input_names_.size()) { + std::stringstream err; + err << "Expected " << input_names_.size() << " inputs, but got " + << inputs.size() << "!"; + throw std::runtime_error(err.str()); + } + for (size_t i = 0; i < inputs.size(); ++i) { + environment[input_names_[i]] = inputs[i]; + } + + for (InstructionType& instr : instructions_) { + // Retrieve all input values for this op + std::vector inputs; + for (const auto& input_name : std::get<1>(instr)) { + // Operator output values shadow constants. + // Imagine all constants are defined in statements at the beginning + // of a function (a la K&R C). Any definition of an output value must + // necessarily come after constant definition in textual order. 
Thus,
+        // we look up values in the environment first, then the constant table
+        // second, to implement this shadowing behavior
+        if (environment.find(input_name) != environment.end()) {
+          inputs.push_back(environment.at(input_name));
+        } else if (constants_.find(input_name) != constants_.end()) {
+          inputs.push_back(constants_.at(input_name));
+        } else {
+          std::stringstream err;
+          err << "Instruction referenced unknown value " << input_name << "!";
+          throw std::runtime_error(err.str());
+        }
+      }
+
+      // Run the specified operation
+      at::Tensor result;
+      const auto& op = std::get<0>(instr);
+      if (op == "add") {
+        if (inputs.size() != 2) {
+          throw std::runtime_error("Unexpected number of inputs for add op!");
+        }
+        result = inputs[0] + inputs[1];
+      } else if (op == "mul") {
+        if (inputs.size() != 2) {
+          throw std::runtime_error("Unexpected number of inputs for mul op!");
+        }
+        result = inputs[0] * inputs[1];
+      } else {
+        std::stringstream err;
+        err << "Unknown operator " << op << "!";
+        throw std::runtime_error(err.str());
+      }
+
+      // Write back result into environment
+      const auto& output_name = std::get<2>(instr);
+      environment[output_name] = std::move(result);
+    }
+
+    if (!output_name_) {
+      throw std::runtime_error("Output name not specified!");
+    }
+
+    return environment.at(*output_name_);
+  }
+
+  // Ser/De infrastructure. See
+  // https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html#defining-serialization-deserialization-methods-for-custom-c-classes
+  // for more info.
+
+  // This is the type we will use to marshal information on disk during
+  // ser/de. It is a simple tuple composed of primitive types and simple
+  // collection types like vector, optional, and dict.
+  using SerializationType = std::tuple<
+      std::vector<std::string> /*input_names_*/,
+      c10::optional<std::string> /*output_name_*/,
+      c10::Dict<std::string, at::Tensor> /*constants_*/,
+      std::vector<InstructionType> /*instructions_*/
+      >;
+
+  // This function yields the SerializationType instance for `this`.
+  SerializationType __getstate__() const {
+    return SerializationType{
+        input_names_, output_name_, constants_, instructions_};
+  }
+
+  // This function will create an instance of `ElementwiseInterpreter` given
+  // an instance of `SerializationType`.
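Editor's sketch, not part of this diff: a minimal end-to-end use of the interpreter defined above, driving the same methods the TORCH_LIBRARY block later in this file exposes and exercising the __getstate__/__setstate__ pair directly (mirroring what the def_pickle registration below wires up). The function name, tensor shapes, and program are illustrative assumptions.

// Illustrative only; assumes this sketch lives in the same TU as
// ElementwiseInterpreter.
at::Tensor exercise_elementwise_interpreter() {
  auto interp = c10::make_intrusive<ElementwiseInterpreter>();
  interp->setInputNames({"x", "y"});
  interp->addConstant("c", torch::ones({2, 2}));

  std::vector<ElementwiseInterpreter::InstructionType> program;
  program.emplace_back("add", std::vector<std::string>{"x", "y"}, "t0");   // t0 = x + y
  program.emplace_back("mul", std::vector<std::string>{"t0", "c"}, "out"); // out = t0 * c
  interp->setInstructions(std::move(program));
  interp->setOutputName("out");

  // Round-trip through the plain-data SerializationType, the same calls the
  // def_pickle lambdas forward to.
  auto restored = ElementwiseInterpreter::__setstate__(interp->__getstate__());

  return restored->__call__({torch::rand({2, 2}), torch::rand({2, 2})});
}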
+ static c10::intrusive_ptr __setstate__( + SerializationType state) { + auto instance = c10::make_intrusive(); + std::tie( + instance->input_names_, + instance->output_name_, + instance->constants_, + instance->instructions_) = std::move(state); + return instance; + } + + // Class members + std::vector input_names_; + c10::optional output_name_; + c10::Dict constants_; + std::vector instructions_; +}; + +TORCH_LIBRARY(_TorchScriptTesting, m) { + m.class_("_Foo") + .def(torch::init()) + // .def(torch::init<>()) + .def("info", &Foo::info) + .def("increment", &Foo::increment) + .def("add", &Foo::add) + .def("combine", &Foo::combine); + + m.class_("_NoInit").def( + "get_x", [](const c10::intrusive_ptr& self) { return self->x; }); + + m.class_>("_StackString") + .def(torch::init>()) + .def("push", &MyStackClass::push) + .def("pop", &MyStackClass::pop) + .def("clone", &MyStackClass::clone) + .def("merge", &MyStackClass::merge) + .def_pickle( + [](const c10::intrusive_ptr>& self) { + return self->stack_; + }, + [](std::vector state) { // __setstate__ + return c10::make_intrusive>( + std::vector{"i", "was", "deserialized"}); + }) + .def("return_a_tuple", &MyStackClass::return_a_tuple) + .def( + "top", + [](const c10::intrusive_ptr>& self) + -> std::string { return self->stack_.back(); }) + .def( + "__str__", + [](const c10::intrusive_ptr>& self) { + std::stringstream ss; + ss << "["; + for (size_t i = 0; i < self->stack_.size(); ++i) { + ss << self->stack_[i]; + if (i != self->stack_.size() - 1) { + ss << ", "; + } + } + ss << "]"; + return ss.str(); + }); + // clang-format off + // The following will fail with a static assert telling you you have to + // take an intrusive_ptr as the first argument. + // .def("foo", [](int64_t a) -> int64_t{ return 3;}); + // clang-format on + + m.class_("_PickleTester") + .def(torch::init>()) + .def_pickle( + [](c10::intrusive_ptr self) { // __getstate__ + return std::vector{1, 3, 3, 7}; + }, + [](std::vector state) { // __setstate__ + return c10::make_intrusive(std::move(state)); + }) + .def( + "top", + [](const c10::intrusive_ptr& self) { + return self->vals.back(); + }) + .def("pop", [](const c10::intrusive_ptr& self) { + auto val = self->vals.back(); + self->vals.pop_back(); + return val; + }); + + m.def( + "take_an_instance(__torch__.torch.classes._TorchScriptTesting._PickleTester x) -> Tensor Y", + take_an_instance); + // test that schema inference is ok too + m.def("take_an_instance_inferred", take_an_instance); + + m.class_("_ElementwiseInterpreter") + .def(torch::init<>()) + .def("set_instructions", &ElementwiseInterpreter::setInstructions) + .def("add_constant", &ElementwiseInterpreter::addConstant) + .def("set_input_names", &ElementwiseInterpreter::setInputNames) + .def("set_output_name", &ElementwiseInterpreter::setOutputName) + .def("__call__", &ElementwiseInterpreter::__call__) + .def_pickle( + /* __getstate__ */ + [](const c10::intrusive_ptr& self) { + return self->__getstate__(); + }, + /* __setstate__ */ + [](ElementwiseInterpreter::SerializationType state) { + return ElementwiseInterpreter::__setstate__(std::move(state)); + }); +} + +} // namespace diff --git a/test/cpp/jit/test_custom_class_registrations.h b/test/cpp/jit/test_custom_class_registrations.h new file mode 100644 index 000000000000..4e6b7bd43883 --- /dev/null +++ b/test/cpp/jit/test_custom_class_registrations.h @@ -0,0 +1,36 @@ +#include +#include + +namespace torch { +namespace jit { + +template +struct MyStackClass : torch::CustomClassHolder { + std::vector stack_; + 
MyStackClass(std::vector init) : stack_(init.begin(), init.end()) {} + + void push(T x) { + stack_.push_back(x); + } + T pop() { + auto val = stack_.back(); + stack_.pop_back(); + return val; + } + + c10::intrusive_ptr clone() const { + return c10::make_intrusive(stack_); + } + + void merge(const c10::intrusive_ptr& c) { + for (auto& elem : c->stack_) { + push(elem); + } + } + + std::tuple return_a_tuple() const { + return std::make_tuple(1337.0f, 123); + } +}; +} // namespace jit +} // namespace torch diff --git a/test/cpp/jit/test_custom_operators.cpp b/test/cpp/jit/test_custom_operators.cpp index 529b36385bd4..d3f61268e8f1 100644 --- a/test/cpp/jit/test_custom_operators.cpp +++ b/test/cpp/jit/test_custom_operators.cpp @@ -1,4 +1,5 @@ -#include "test/cpp/jit/test_base.h" +#include + #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/ir/alias_analysis.h" @@ -11,134 +12,135 @@ namespace torch { namespace jit { -void testCustomOperators() { - { - torch::RegisterOperators reg( - "foo::bar", [](double a, at::Tensor b) { return a + b; }); - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar")); - ASSERT_EQ(ops.size(), 1); +TEST(CustomOperatorTest, InferredSchema) { + torch::RegisterOperators reg( + "foo::bar", [](double a, at::Tensor b) { return a + b; }); + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar")); + ASSERT_EQ(ops.size(), 1); - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::bar"); + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::bar"); - ASSERT_EQ(op->schema().arguments().size(), 2); - ASSERT_EQ(op->schema().arguments()[0].name(), "_0"); - ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); - ASSERT_EQ(op->schema().arguments()[1].name(), "_1"); - ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); + ASSERT_EQ(op->schema().arguments().size(), 2); + ASSERT_EQ(op->schema().arguments()[0].name(), "_0"); + ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); + ASSERT_EQ(op->schema().arguments()[1].name(), "_1"); + ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); - ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); + ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); - Stack stack; - push(stack, 2.0f, at::ones(5)); - op->getOperation()(&stack); - at::Tensor output; - pop(stack, output); + Stack stack; + push(stack, 2.0f, at::ones(5)); + op->getOperation()(&stack); + at::Tensor output; + pop(stack, output); - ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); - } - { - torch::RegisterOperators reg( - "foo::bar_with_schema(float a, Tensor b) -> Tensor", - [](double a, at::Tensor b) { return a + b; }); + ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); +} - auto& ops = - getAllOperatorsFor(Symbol::fromQualString("foo::bar_with_schema")); - ASSERT_EQ(ops.size(), 1); +TEST(CustomOperatorTest, ExplicitSchema) { + torch::RegisterOperators reg( + "foo::bar_with_schema(float a, Tensor b) -> Tensor", + [](double a, at::Tensor b) { return a + b; }); - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::bar_with_schema"); + auto& ops = + getAllOperatorsFor(Symbol::fromQualString("foo::bar_with_schema")); + ASSERT_EQ(ops.size(), 1); - ASSERT_EQ(op->schema().arguments().size(), 2); - ASSERT_EQ(op->schema().arguments()[0].name(), "a"); - ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); - ASSERT_EQ(op->schema().arguments()[1].name(), "b"); - 
ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::bar_with_schema"); - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); + ASSERT_EQ(op->schema().arguments().size(), 2); + ASSERT_EQ(op->schema().arguments()[0].name(), "a"); + ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); + ASSERT_EQ(op->schema().arguments()[1].name(), "b"); + ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); - Stack stack; - push(stack, 2.0f, at::ones(5)); - op->getOperation()(&stack); - at::Tensor output; - pop(stack, output); + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); - ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); - } - { - // Check that lists work well. - torch::RegisterOperators reg( - "foo::lists(int[] ints, float[] floats, Tensor[] tensors) -> float[]", - [](torch::List ints, - torch::List floats, - torch::List tensors) { return floats; }); - - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists")); - ASSERT_EQ(ops.size(), 1); - - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::lists"); - - ASSERT_EQ(op->schema().arguments().size(), 3); - ASSERT_EQ(op->schema().arguments()[0].name(), "ints"); - ASSERT_TRUE( - op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofInts())); - ASSERT_EQ(op->schema().arguments()[1].name(), "floats"); - ASSERT_TRUE( - op->schema().arguments()[1].type()->isSubtypeOf(ListType::ofFloats())); - ASSERT_EQ(op->schema().arguments()[2].name(), "tensors"); - ASSERT_TRUE( - op->schema().arguments()[2].type()->isSubtypeOf(ListType::ofTensors())); - - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_TRUE( - op->schema().returns()[0].type()->isSubtypeOf(ListType::ofFloats())); - - Stack stack; - push(stack, c10::List({1, 2})); - push(stack, c10::List({1.0, 2.0})); - push(stack, c10::List({at::ones(5)})); - op->getOperation()(&stack); - c10::List output; - pop(stack, output); - - ASSERT_EQ(output.size(), 2); - ASSERT_EQ(output.get(0), 1.0); - ASSERT_EQ(output.get(1), 2.0); - } - { - torch::RegisterOperators reg( - "foo::lists2(Tensor[] tensors) -> Tensor[]", - [](torch::List tensors) { return tensors; }); + Stack stack; + push(stack, 2.0f, at::ones(5)); + op->getOperation()(&stack); + at::Tensor output; + pop(stack, output); + + ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); +} - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists2")); - ASSERT_EQ(ops.size(), 1); +TEST(CustomOperatorTest, ListParameters) { + // Check that lists work well. 
+ torch::RegisterOperators reg( + "foo::lists(int[] ints, float[] floats, Tensor[] tensors) -> float[]", + [](torch::List ints, + torch::List floats, + torch::List tensors) { return floats; }); + + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists")); + ASSERT_EQ(ops.size(), 1); + + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::lists"); + + ASSERT_EQ(op->schema().arguments().size(), 3); + ASSERT_EQ(op->schema().arguments()[0].name(), "ints"); + ASSERT_TRUE( + op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofInts())); + ASSERT_EQ(op->schema().arguments()[1].name(), "floats"); + ASSERT_TRUE( + op->schema().arguments()[1].type()->isSubtypeOf(ListType::ofFloats())); + ASSERT_EQ(op->schema().arguments()[2].name(), "tensors"); + ASSERT_TRUE( + op->schema().arguments()[2].type()->isSubtypeOf(ListType::ofTensors())); + + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_TRUE( + op->schema().returns()[0].type()->isSubtypeOf(ListType::ofFloats())); + + Stack stack; + push(stack, c10::List({1, 2})); + push(stack, c10::List({1.0, 2.0})); + push(stack, c10::List({at::ones(5)})); + op->getOperation()(&stack); + c10::List output; + pop(stack, output); + + ASSERT_EQ(output.size(), 2); + ASSERT_EQ(output.get(0), 1.0); + ASSERT_EQ(output.get(1), 2.0); +} - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::lists2"); +TEST(CustomOperatorTest, ListParameters2) { + torch::RegisterOperators reg( + "foo::lists2(Tensor[] tensors) -> Tensor[]", + [](torch::List tensors) { return tensors; }); - ASSERT_EQ(op->schema().arguments().size(), 1); - ASSERT_EQ(op->schema().arguments()[0].name(), "tensors"); - ASSERT_TRUE( - op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofTensors())); + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists2")); + ASSERT_EQ(ops.size(), 1); - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_TRUE( - op->schema().returns()[0].type()->isSubtypeOf(ListType::ofTensors())); + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::lists2"); - Stack stack; - push(stack, c10::List({at::ones(5)})); - op->getOperation()(&stack); - c10::List output; - pop(stack, output); + ASSERT_EQ(op->schema().arguments().size(), 1); + ASSERT_EQ(op->schema().arguments()[0].name(), "tensors"); + ASSERT_TRUE( + op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofTensors())); - ASSERT_EQ(output.size(), 1); - ASSERT_TRUE(output.get(0).allclose(at::ones(5))); - } + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_TRUE( + op->schema().returns()[0].type()->isSubtypeOf(ListType::ofTensors())); + + Stack stack; + push(stack, c10::List({at::ones(5)})); + op->getOperation()(&stack); + c10::List output; + pop(stack, output); + + ASSERT_EQ(output.size(), 1); + ASSERT_TRUE(output.get(0).allclose(at::ones(5))); } -void testCustomOperatorAliasing() { +TEST(CustomOperatorTest, Aliasing) { torch::RegisterOperators reg( "foo::aliasing", [](at::Tensor a, at::Tensor b) -> at::Tensor { a.add_(b); @@ -182,77 +184,65 @@ graph(%x: Tensor, %y: Tensor): } } -void testIValueKWargs() { - const auto text = R"( - def foo(a : int, b : int, c : int = 4): - return a + 2*b + 3*c - )"; - auto cu = compile(text); - auto result = cu->get_function("foo")({1}, {{"b", 3}}); - ASSERT_EQ(result.toInt(), 19); -} - -void testTemplatedOperatorCreator() { - constexpr char op_list[] = "foofoo::bar.template;foo::another"; +static constexpr char op_list[] = "foofoo::bar.template;foo::another"; #define TORCH_SELECTIVE_NAME_IN_SCHEMA(l, n) \ 
torch::detail::SelectiveStr(n) - { - // Try to register an op name that does not exist in op_list. - // Expected: the op name is not registered. - torch::jit::RegisterOperators reg({OperatorGenerator( - TORCH_SELECTIVE_NAME_IN_SCHEMA( - op_list, "foofoo::not_exist(float a, Tensor b) -> Tensor"), - [](Stack* stack) { - double a; - at::Tensor b; - pop(stack, a, b); - push(stack, a + b); - }, - aliasAnalysisFromSchema())}); - - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::not_exist")); - ASSERT_EQ(ops.size(), 0); - } +TEST(TestCustomOperator, OperatorGeneratorUndeclared) { + // Try to register an op name that does not exist in op_list. + // Expected: the op name is not registered. + torch::jit::RegisterOperators reg({OperatorGenerator( + TORCH_SELECTIVE_NAME_IN_SCHEMA( + op_list, "foofoo::not_exist(float a, Tensor b) -> Tensor"), + [](Stack* stack) { + double a; + at::Tensor b; + pop(stack, a, b); + push(stack, a + b); + }, + aliasAnalysisFromSchema())}); + + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::not_exist")); + ASSERT_EQ(ops.size(), 0); +} - { - // The operator should be successfully registered since its name is in the - // whitelist. - torch::jit::RegisterOperators reg({OperatorGenerator( - TORCH_SELECTIVE_NAME_IN_SCHEMA( - op_list, "foofoo::bar.template(float a, Tensor b) -> Tensor"), - [](Stack* stack) { - double a; - at::Tensor b; - pop(stack, a, b); - push(stack, a + b); - }, - aliasAnalysisFromSchema())}); - - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::bar")); - ASSERT_EQ(ops.size(), 1); - - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foofoo::bar"); - - ASSERT_EQ(op->schema().arguments().size(), 2); - ASSERT_EQ(op->schema().arguments()[0].name(), "a"); - ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); - ASSERT_EQ(op->schema().arguments()[1].name(), "b"); - ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); - - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); - - Stack stack; - push(stack, 2.0f, at::ones(5)); - op->getOperation()(&stack); - at::Tensor output; - pop(stack, output); - - ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); - } +TEST(TestCustomOperator, OperatorGeneratorBasic) { + // The operator should be successfully registered since its name is in the + // whitelist. 
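Editor's sketch, not part of this diff: conceptually, the gating these two tests exercise boils down to a compile-time check of whether the operator name occurs in the semicolon-separated op_list. The naive check below is a hedged stand-in for illustration only, not PyTorch's actual torch::detail::SelectiveStr machinery; it assumes op_list from this file is in scope.

// Naive compile-time substring check (C++14), illustrative only.
constexpr bool name_in_list(const char* list, const char* name) {
  for (const char* p = list; *p != '\0'; ++p) {
    const char* a = p;
    const char* b = name;
    while (*a != '\0' && *b != '\0' && *a == *b) {
      ++a;
      ++b;
    }
    if (*b == '\0') {
      return true; // every character of `name` matched at this offset
    }
  }
  return false;
}
static_assert(name_in_list(op_list, "foofoo::bar"), "in the allow-list: registered");
static_assert(!name_in_list(op_list, "foofoo::not_exist"), "not listed: skipped");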
+ torch::jit::RegisterOperators reg({OperatorGenerator( + TORCH_SELECTIVE_NAME_IN_SCHEMA( + op_list, "foofoo::bar.template(float a, Tensor b) -> Tensor"), + [](Stack* stack) { + double a; + at::Tensor b; + pop(stack, a, b); + push(stack, a + b); + }, + aliasAnalysisFromSchema())}); + + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::bar")); + ASSERT_EQ(ops.size(), 1); + + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foofoo::bar"); + + ASSERT_EQ(op->schema().arguments().size(), 2); + ASSERT_EQ(op->schema().arguments()[0].name(), "a"); + ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); + ASSERT_EQ(op->schema().arguments()[1].name(), "b"); + ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); + + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); + + Stack stack; + push(stack, 2.0f, at::ones(5)); + op->getOperation()(&stack); + at::Tensor output; + pop(stack, output); + + ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); } } // namespace jit diff --git a/test/cpp/jit/test_dce.cpp b/test/cpp/jit/test_dce.cpp index 5799913c316a..6f9161d0d9ae 100644 --- a/test/cpp/jit/test_dce.cpp +++ b/test/cpp/jit/test_dce.cpp @@ -1,12 +1,12 @@ -#include -#include +#include +#include #include #include namespace torch { namespace jit { -void testDCE() { +TEST(EliminateDeadCodeTest, Basic) { auto graph = std::make_shared(); // Consider the following loop: diff --git a/test/cpp/jit/test_fuser.cpp b/test/cpp/jit/test_fuser.cpp index ee0ea060f02f..ef595215b882 100644 --- a/test/cpp/jit/test_fuser.cpp +++ b/test/cpp/jit/test_fuser.cpp @@ -1,4 +1,4 @@ -#include "test/cpp/jit/test_base.h" +#include #include #include "ATen/core/interned_strings.h" @@ -56,28 +56,27 @@ namespace torch { namespace jit { -void testFusion() { - auto testSimple = [&] { - const auto graph_string = R"IR( +TEST(FuserTest, TestSimple_CUDA) { + const auto graph_string = R"IR( graph(%0 : Tensor, %1 : Tensor): %2 : Tensor = aten::mul(%0, %1) return (%2))IR"; - Graph graph; - torch::jit::parseIR(graph_string, &graph); - - auto a = at::rand({3, 4}, at::kCUDA); - auto b = at::rand({4, 3}, at::kCUDA).transpose(0, 1); - auto o = at::zeros({3, 4}, at::kCUDA); - auto outputs = debugLaunchGraph(graph, {a, b}); - ASSERT_EQ(outputs.size(), 1); - auto o2 = a * b; - float max_diff = (o2 - outputs[0]).abs().max().item(); - // std::cout << "max diff: " << max_diff << "\n"; - ASSERT_EQ(max_diff, 0); - }; - testSimple(); + Graph graph; + torch::jit::parseIR(graph_string, &graph); + + auto a = at::rand({3, 4}, at::kCUDA); + auto b = at::rand({4, 3}, at::kCUDA).transpose(0, 1); + auto o = at::zeros({3, 4}, at::kCUDA); + auto outputs = debugLaunchGraph(graph, {a, b}); + ASSERT_EQ(outputs.size(), 1); + auto o2 = a * b; + float max_diff = (o2 - outputs[0]).abs().max().item(); + // std::cout << "max diff: " << max_diff << "\n"; + ASSERT_EQ(max_diff, 0); +} +TEST(FuserTest, TestOne_CUDA) { auto testOne = [&](int ti, int tj) { const auto graph_string = R"IR( graph(%0 : Tensor, @@ -132,7 +131,9 @@ void testFusion() { testOne(0, 1); testOne(1, 2); testOne(0, 2); +} +TEST(FuserTest, FusedConcat_CUDA) { const auto graph_string0 = R"IR( graph(%0 : Tensor, %1 : Tensor): @@ -175,7 +176,7 @@ void testFusion() { }; } -void testFusionAliasing() { +TEST(FuserTest, FusionAliasing) { const auto graph_string = R"IR( graph(%0 : Tensor, %1 : Tensor): @@ -200,7 +201,7 @@ void testFusionAliasing() { ->run(*g); } -void 
testRegisterFusionCachesKernel() { +TEST(FuserTest, KernelCaching) { // Constructs two functionally equivalent graphs const auto graph0_string = R"IR( graph(%0 : Float(2, 3, 4), diff --git a/test/cpp/jit/test_gpu.cpp b/test/cpp/jit/test_gpu.cpp index 80fa318d653a..38008d417256 100644 --- a/test/cpp/jit/test_gpu.cpp +++ b/test/cpp/jit/test_gpu.cpp @@ -1,8 +1,8 @@ #if defined(USE_CUDA) - -#include +#include #include +#include #include #include #include @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -73,11 +74,11 @@ TensorView* makeTensorWithContig( } void checkIntValue( - const EvaluationContext* eval_context, + StatefulExpressionEvaluator& evaluator, Val* val, Int::ScalarType expected_value) { TORCH_CHECK(val->isAnInt()); - const auto actual_value = ExpressionEvaluator::evaluate(val, eval_context); + const auto actual_value = evaluator.inferValue(val); TORCH_CHECK(actual_value.has_value()); TORCH_CHECK(actual_value.value() == expected_value); } @@ -91,7 +92,7 @@ void checkIntValue( // (These tests exercise IrGraphGenerator through a non-trivial IR, // to make sure that it runs w/o crashing. The actual output is not // validated) -void testGPU_IrGraphGenerator() { +TEST(NVFuserTest, IrGraphGenerator_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -143,7 +144,7 @@ void testGPU_IrGraphGenerator() { .empty()); } -void testGPU_FusionDispatch() { +TEST(NVFuserTest, FusionDispatch_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -158,28 +159,28 @@ void testGPU_FusionDispatch() { } // Evaluate basic scalar operations with constant values -void testGPU_FusionExprEvalConstants() { +TEST(NVFuserTest, FusionExprEvalConstants_CUDA) { Fusion fusion; FusionGuard fg(&fusion); - EvaluationContext eval_context(&fusion); + StatefulExpressionEvaluator evaluator(&fusion); auto* a = new Int(7); auto* b = new Int(3); - checkIntValue(&eval_context, neg(a), -7); - checkIntValue(&eval_context, add(a, b), 10); - checkIntValue(&eval_context, neg(mul(sub(a, b), div(a, b))), -8); - checkIntValue(&eval_context, mod(a, b), 1); - checkIntValue(&eval_context, ceilDiv(a, b), 3); + checkIntValue(evaluator, neg(a), -7); + checkIntValue(evaluator, add(a, b), 10); + checkIntValue(evaluator, neg(mul(sub(a, b), div(a, b))), -8); + checkIntValue(evaluator, mod(a, b), 1); + checkIntValue(evaluator, ceilDiv(a, b), 3); } // Evaluate basic scalar operations with bound values -void testGPU_FusionExprEvalBindings() { +TEST(NVFuserTest, FusionExprEvalBindings_CUDA) { Fusion fusion; FusionGuard fg(&fusion); - EvaluationContext eval_context(&fusion); + StatefulExpressionEvaluator evaluator(&fusion); auto* a = new Int(); auto* b = new Int(); @@ -188,39 +189,39 @@ void testGPU_FusionExprEvalBindings() { auto* e = new Int(0); // trying to evaluate before binding should give empty results - TORCH_CHECK(!ExpressionEvaluator::evaluate(a, &eval_context).has_value()); - TORCH_CHECK(!ExpressionEvaluator::evaluate(d, &eval_context).has_value()); + TORCH_CHECK(!evaluator.inferValue(a).has_value()); + TORCH_CHECK(!evaluator.inferValue(d).has_value()); - eval_context.bind(a, 7); - eval_context.bind(b, 3); + evaluator.safeBind(a, 7); + evaluator.safeBind(b, 3); // can't bind to the results of expressions - ASSERT_ANY_THROW(eval_context.bind(c, 100)); + ASSERT_ANY_THROW(evaluator.safeBind(c, 100)); // can't bind to concrete values - ASSERT_ANY_THROW(eval_context.bind(e, 100)); + ASSERT_ANY_THROW(evaluator.safeBind(e, 100)); - checkIntValue(&eval_context, c, 10); - checkIntValue(&eval_context, sub(a, b), 4); - 
checkIntValue(&eval_context, mod(a, b), 1); - checkIntValue(&eval_context, ceilDiv(a, b), 3); - checkIntValue(&eval_context, d, -4); + checkIntValue(evaluator, c, 10); + checkIntValue(evaluator, sub(a, b), 4); + checkIntValue(evaluator, mod(a, b), 1); + checkIntValue(evaluator, ceilDiv(a, b), 3); + checkIntValue(evaluator, d, -4); // Reset evaluation context - eval_context = EvaluationContext(&fusion); + evaluator = StatefulExpressionEvaluator(&fusion); - eval_context.bind(a, 2); - eval_context.bind(b, 5); + evaluator.safeBind(a, 2); + evaluator.safeBind(b, 5); - checkIntValue(&eval_context, c, 7); - checkIntValue(&eval_context, sub(a, b), -3); - checkIntValue(&eval_context, mod(a, b), 2); - checkIntValue(&eval_context, ceilDiv(a, b), 1); - checkIntValue(&eval_context, d, -2); + checkIntValue(evaluator, c, 7); + checkIntValue(evaluator, sub(a, b), -3); + checkIntValue(evaluator, mod(a, b), 2); + checkIntValue(evaluator, ceilDiv(a, b), 1); + checkIntValue(evaluator, d, -2); } // Evaluate expressions in a simple IR -void testGPU_FusionExprEvalBasic() { +TEST(NVFuserTest, FusionExprEvalBasic_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -247,8 +248,8 @@ void testGPU_FusionExprEvalBasic() { tv2->axis(-1)->parallelize(ParallelType::TIDx); tv3->axis(-1)->parallelize(ParallelType::TIDx); - // 1. Create an evaluation context - EvaluationContext eval_context(&fusion); + // 1. Create an evaluator + StatefulExpressionEvaluator evaluator(&fusion); // 2. Bind values // @@ -258,25 +259,25 @@ void testGPU_FusionExprEvalBasic() { // (ex. `tv0->getRootDomain()[0]->extent()` // instead of `tv0->axis(0)->extent()`) // - eval_context.bind(tv0->getRootDomain()[0]->extent(), 6); - eval_context.bind(tv0->getRootDomain()[1]->extent(), 128); - eval_context.bind(tv1->getRootDomain()[0]->extent(), 6); - eval_context.bind(tv1->getRootDomain()[1]->extent(), 128); + evaluator.safeBind(tv0->getRootDomain()[0]->extent(), 6); + evaluator.safeBind(tv0->getRootDomain()[1]->extent(), 128); + evaluator.safeBind(tv1->getRootDomain()[0]->extent(), 6); + evaluator.safeBind(tv1->getRootDomain()[1]->extent(), 128); // 3. Evaluate and check result values TORCH_CHECK(tv2->domain()->nDims() == 3); - checkIntValue(&eval_context, tv2->axis(0)->rawExtent(), 2); - checkIntValue(&eval_context, tv2->axis(1)->rawExtent(), 4); - checkIntValue(&eval_context, tv2->axis(2)->rawExtent(), 128); + checkIntValue(evaluator, tv2->axis(0)->rawExtent(), 2); + checkIntValue(evaluator, tv2->axis(1)->rawExtent(), 4); + checkIntValue(evaluator, tv2->axis(2)->rawExtent(), 128); TORCH_CHECK(tv3->domain()->nDims() == 3); - checkIntValue(&eval_context, tv3->axis(0)->rawExtent(), 2); - checkIntValue(&eval_context, tv3->axis(1)->rawExtent(), 4); - checkIntValue(&eval_context, tv3->axis(2)->rawExtent(), 128); + checkIntValue(evaluator, tv3->axis(0)->rawExtent(), 2); + checkIntValue(evaluator, tv3->axis(1)->rawExtent(), 4); + checkIntValue(evaluator, tv3->axis(2)->rawExtent(), 128); } // Evaluate expressions in a more complex IR -void testGPU_FusionExprEvalComplex() { +TEST(NVFuserTest, FusionExprEvalComplex_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -298,37 +299,37 @@ void testGPU_FusionExprEvalComplex() { tv6->split(0, 5); tv5->merge(0); - // 1. Create an evaluation context - EvaluationContext eval_context(&fusion); + // 1. Create an evaluator + StatefulExpressionEvaluator evaluator(&fusion); // 2. 
Bind values - eval_context.bind(tv0->getRootDomain()[0]->extent(), 129); - eval_context.bind(tv0->getRootDomain()[1]->extent(), 127); + evaluator.safeBind(tv0->getRootDomain()[0]->extent(), 129); + evaluator.safeBind(tv0->getRootDomain()[1]->extent(), 127); // Evaluate and check extent values TORCH_CHECK(tv0->domain()->nDims() == 2); - checkIntValue(&eval_context, tv0->axis(0)->rawExtent(), 129); - checkIntValue(&eval_context, tv0->axis(1)->rawExtent(), 127); + checkIntValue(evaluator, tv0->axis(0)->rawExtent(), 129); + checkIntValue(evaluator, tv0->axis(1)->rawExtent(), 127); TORCH_CHECK(tv3->domain()->nDims() == 2); - checkIntValue(&eval_context, tv3->axis(0)->rawExtent(), 129); - checkIntValue(&eval_context, tv3->axis(1)->rawExtent(), 127); + checkIntValue(evaluator, tv3->axis(0)->rawExtent(), 129); + checkIntValue(evaluator, tv3->axis(1)->rawExtent(), 127); TORCH_CHECK(tv4->domain()->nDims() == 2); - checkIntValue(&eval_context, tv4->axis(0)->rawExtent(), 129); - checkIntValue(&eval_context, tv4->axis(1)->rawExtent(), 127); + checkIntValue(evaluator, tv4->axis(0)->rawExtent(), 129); + checkIntValue(evaluator, tv4->axis(1)->rawExtent(), 127); TORCH_CHECK(tv5->domain()->nDims() == 1); - checkIntValue(&eval_context, tv5->axis(0)->rawExtent(), 16383); + checkIntValue(evaluator, tv5->axis(0)->rawExtent(), 16383); TORCH_CHECK(tv6->domain()->nDims() == 3); - checkIntValue(&eval_context, tv6->axis(0)->rawExtent(), 26); - checkIntValue(&eval_context, tv6->axis(1)->rawExtent(), 5); - checkIntValue(&eval_context, tv6->axis(2)->rawExtent(), 127); + checkIntValue(evaluator, tv6->axis(0)->rawExtent(), 26); + checkIntValue(evaluator, tv6->axis(1)->rawExtent(), 5); + checkIntValue(evaluator, tv6->axis(2)->rawExtent(), 127); } // Evaluate expressions post lowering -void testGPU_FusionExprEvalPostLower() { +TEST(NVFuserTest, FusionExprEvalPostLower_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -360,34 +361,32 @@ void testGPU_FusionExprEvalPostLower() { // Lower GpuLower gpulw(&fusion); - std::stringstream kernel; - gpulw.printKernel(kernel); // 1. Create an evaluation context - EvaluationContext eval_context(&fusion); + StatefulExpressionEvaluator evaluator(&fusion); // 2. Bind values - eval_context.bind(tv0->getRootDomain()[0]->extent(), 6); - eval_context.bind(tv0->getRootDomain()[1]->extent(), 128); - eval_context.bind(tv1->getRootDomain()[0]->extent(), 6); - eval_context.bind(tv1->getRootDomain()[1]->extent(), 128); + evaluator.safeBind(tv0->getRootDomain()[0]->extent(), 6); + evaluator.safeBind(tv0->getRootDomain()[1]->extent(), 128); + evaluator.safeBind(tv1->getRootDomain()[0]->extent(), 6); + evaluator.safeBind(tv1->getRootDomain()[1]->extent(), 128); // 3. 
Evaluate and check result values TORCH_CHECK(tv2->domain()->nDims() == 3); - checkIntValue(&eval_context, tv2->axis(0)->rawExtent(), 2); - checkIntValue(&eval_context, tv2->axis(1)->rawExtent(), 4); - checkIntValue(&eval_context, tv2->axis(2)->rawExtent(), 128); + checkIntValue(evaluator, tv2->axis(0)->rawExtent(), 2); + checkIntValue(evaluator, tv2->axis(1)->rawExtent(), 4); + checkIntValue(evaluator, tv2->axis(2)->rawExtent(), 128); TORCH_CHECK(tv3->domain()->nDims() == 3); - checkIntValue(&eval_context, tv3->axis(0)->rawExtent(), 2); - checkIntValue(&eval_context, tv3->axis(1)->rawExtent(), 4); - checkIntValue(&eval_context, tv3->axis(2)->rawExtent(), 128); + checkIntValue(evaluator, tv3->axis(0)->rawExtent(), 2); + checkIntValue(evaluator, tv3->axis(1)->rawExtent(), 4); + checkIntValue(evaluator, tv3->axis(2)->rawExtent(), 128); - checkIntValue(&eval_context, bid_x, 2); - checkIntValue(&eval_context, tid_x, 128); + checkIntValue(evaluator, bid_x, 2); + checkIntValue(evaluator, tid_x, 128); } -void testGPU_FusionClear() { +TEST(NVFuserTest, FusionClear_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -467,7 +466,7 @@ void testGPU_FusionClear() { TORCH_CHECK(output_ref.equal(outputs[0])); } -void testGPU_FusionCopy() { +TEST(NVFuserTest, FusionCopy_CUDA) { Fusion original_fusion; // Create the test IR @@ -505,10 +504,12 @@ void testGPU_FusionCopy() { ASSERT_EQ(original_ir.str(), clone_ir.str()); // Lower original fusion - std::stringstream original_kernel; + std::string original_kernel; { - GpuLower lower(&original_fusion); - lower.printKernel(original_kernel); + // TODO(kir): remove this guard once we implement the cuda codegen visitor + FusionGuard fg(&original_fusion); + original_kernel = + codegen::generateCudaKernel(GpuLower(&original_fusion).kernel()); } // Make sure the "before lowering" clone was not mutated @@ -529,15 +530,17 @@ void testGPU_FusionCopy() { ASSERT_EQ(original_lowered_ir.str(), clone_lowered_ir.str()); // Lower the "before lowering" and compare kernels - std::stringstream clone_kernel; + std::string clone_kernel; { - GpuLower lower(&before_lowering); - lower.printKernel(clone_kernel); + // TODO(kir): remove this guard once we implement the cuda codegen visitor + FusionGuard fg(&before_lowering); + clone_kernel = + codegen::generateCudaKernel(GpuLower(&before_lowering).kernel()); } - ASSERT_EQ(original_kernel.str(), clone_kernel.str()); + ASSERT_EQ(original_kernel, clone_kernel); } -void testGPU_FusionMove() { +TEST(NVFuserTest, FusionMove_CUDA) { Fusion fusion; // Create the test IR @@ -593,9 +596,7 @@ void testGPU_FusionMove() { ASSERT_EQ(original_ir.str(), another_ir.str()); // Lower the fusion IR - std::stringstream kernel; GpuLower lower(&another_fusion); - lower.printKernel(kernel); std::stringstream lowered_ir; lowered_ir << another_fusion; @@ -609,7 +610,7 @@ void testGPU_FusionMove() { ASSERT_EQ(lowered_ir.str(), moved_lowered_ir.str()); } -void testGPU_FusionSimpleArith() { +TEST(NVFuserTest, FusionSimpleArith_CUDA) { std::stringstream ss1, ss2; Fusion fusion; @@ -638,7 +639,7 @@ void testGPU_FusionSimpleArith() { "Error where explicit add nodes don't match implicit add nodes."); } -void testGPU_FusionSimpleTypePromote() { +TEST(NVFuserTest, FusionSimpleTypePromote_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -661,7 +662,7 @@ class ZeroMutator : public OptOutMutator { } }; -void testGPU_FusionMutator() { +TEST(NVFuserTest, FusionMutator_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -679,7 +680,7 @@ void testGPU_FusionMutator() { 
TORCH_CHECK(flhs->value().value() == 0.f); } -void testGPU_FusionRegister() { +TEST(NVFuserTest, FusionRegister_CUDA) { Fusion fusion; FusionGuard fg(&fusion); Float* v1 = new Float{1.f}; @@ -710,7 +711,7 @@ struct DummyExpr : public Expr { DummyExpr& operator=(DummyExpr&& other) = delete; }; -void testGPU_FusionTopoSort() { +TEST(NVFuserTest, FusionTopoSort_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -777,7 +778,7 @@ void testGPU_FusionTopoSort() { TORCH_CHECK(fusion.origin(v6)->name() == 3); } -void testGPU_FusionTensor() { +TEST(NVFuserTest, FusionTensor_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); Fusion fusion; @@ -799,48 +800,6 @@ void testGPU_FusionTensor() { } } - { - auto tensor = at::randn({2, 1, 4}, options); - auto tensor_type = TensorType::create(tensor); - auto fuser_tensor = new TensorView(tensor_type); - TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim()); - TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float); - TORCH_CHECK(fuser_tensor->domain() != nullptr); - for (int i = 0; i < static_cast(fuser_tensor->nDims()); i++) { - // size 1 dimension are makred as broadcast - TORCH_CHECK( - fuser_tensor->axis(i)->isBroadcast() == (tensor.sizes()[i] == 1)); - } - TORCH_CHECK(fuser_tensor->domain()->contiguity()[2]); - - // temporary WAR to disable contig & bcast; issue # 230 - // TODO: insert the check where broadcast & contiguous cannot be marked - // together - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[0]); - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[1]); - } - - { - auto tensor = at::randn({2, 3, 1}, options); - auto tensor_type = TensorType::create(tensor); - auto fuser_tensor = new TensorView(tensor_type); - TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim()); - TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float); - TORCH_CHECK(fuser_tensor->domain() != nullptr); - for (int i = 0; i < static_cast(fuser_tensor->nDims()); i++) { - // size 1 dimension are makred as broadcast - TORCH_CHECK( - fuser_tensor->axis(i)->isBroadcast() == (tensor.sizes()[i] == 1)); - } - TORCH_CHECK(fuser_tensor->domain()->contiguity()[0]); - - // temporary WAR to disable contig & bcast; issue # 230 - // TODO: insert the check where broadcast & contiguous cannot be marked - // together - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[1]); - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[2]); - } - // TensorType::create fills stride_properties, which helps us to mark // IterDomain properly // Note: implementation could change, depending on how much we want to invest @@ -883,7 +842,7 @@ void testGPU_FusionTensor() { } } -void testGPU_FusionFilterVals() { +TEST(NVFuserTest, FusionFilterVals_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -921,7 +880,7 @@ void testGPU_FusionFilterVals() { "Not expecting any results"); } -void testGPU_FusionTVSplit() { +TEST(NVFuserTest, FusionTVSplit_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -947,7 +906,7 @@ void testGPU_FusionTVSplit() { static_cast(inner->extent())->value().value() == 2); } -void testGPU_FusionTVMerge() { +TEST(NVFuserTest, FusionTVMerge_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -965,7 +924,7 @@ void testGPU_FusionTVMerge() { tv->getRootDomain()[2]->extent()); } -void testGPU_FusionTVReorder() { +TEST(NVFuserTest, FusionTVReorder_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -1012,7 +971,7 @@ void testGPU_FusionTVReorder() { TORCH_CHECK(ref[1]->sameAs(tv->axis(1))); } -void testGPU_FusionEquality() { 
+TEST(NVFuserTest, FusionEquality_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -1053,7 +1012,7 @@ void testGPU_FusionEquality() { TORCH_CHECK(!neg1->sameAs(neg2)); } -void testGPU_FusionDependency() { +TEST(NVFuserTest, FusionDependency_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -1123,7 +1082,7 @@ void testGPU_FusionDependency() { TORCH_CHECK(dep_chain.empty()); } -void testGPU_FusionParser() { +TEST(NVFuserTest, FusionParser_CUDA) { auto g = std::make_shared(); const auto graph0_string = R"IR( graph(%0 : Float(2:1), @@ -1156,43 +1115,36 @@ void testGPU_FusionParser() { // 1. this can be moved to a dedicated "golden" file // 2. use a fuzzy compare (ignore non-significant whitespaces for example) const std::string expected_kernel = R"( -__global__ void CUDAGeneratedKernel(Tensor T0, Tensor T1, Tensor T3){ - float T2[4]; - if ( ( ( ( ( ( blockIdx.x * 4 ) + ( 4 - 1 ) ) * 128 ) + threadIdx.x ) < T0.size[0] ) ) { - for(size_t i6 = 0; i6 < 4; ++i6 ) { - T2[ i6 ] - = T0[ ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) ] - * T1[ ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) ]; +__global__ void CUDAGeneratedKernel(Tensor T0, Tensor T1, Tensor T3) { + float T2[1]; + if ((((((blockIdx.x * 1) + (1 - 1)) * 128) + threadIdx.x) < T0.size[0])) { + for(size_t i6 = 0; i6 < 1; ++i6) { + T2[i6] + = T0[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)] + * T1[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)]; + T3[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)] + = T2[i6] + * T0[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)]; } } else { - for(size_t i6 = 0; i6 < 4; ++i6 ) { - if ( ( ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) < T0.size[0] ) ) { - T2[ i6 ] - = T0[ ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) ] - * T1[ ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) ]; + for(size_t i6 = 0; i6 < 1; ++i6) { + if ((((((blockIdx.x * 1) + i6) * 128) + threadIdx.x) < T0.size[0])) { + T2[i6] + = T0[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)] + * T1[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)]; } - } - } - if ( ( ( ( ( ( blockIdx.x * 4 ) + ( 4 - 1 ) ) * 128 ) + threadIdx.x ) < T0.size[0] ) ) { - for(size_t i13 = 0; i13 < 4; ++i13 ) { - T3[ ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + threadIdx.x ) ] - = T2[ i13 ] - * T0[ ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + threadIdx.x ) ]; - } - } else { - for(size_t i13 = 0; i13 < 4; ++i13 ) { - if ( ( ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + threadIdx.x ) < T0.size[0] ) ) { - T3[ ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + threadIdx.x ) ] - = T2[ i13 ] - * T0[ ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + threadIdx.x ) ]; + if ((((((blockIdx.x * 1) + i6) * 128) + threadIdx.x) < T0.size[0])) { + T3[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)] + = T2[i6] + * T0[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)]; } } } } )"; - std::string actual_kernel = GpuLower(fusion.get()).getKernel(); - actual_kernel = "\n" + actual_kernel; + const std::string actual_kernel = + "\n" + codegen::generateCudaKernel(GpuLower(fusion.get()).kernel()); if (expected_kernel.size() != actual_kernel.size() || expected_kernel.compare(actual_kernel) != 0) { std::cerr @@ -1210,7 +1162,7 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor T1, Te TORCH_CHECK(output_ref.equal(outputs[0])); } -void testGPU_FusionForLoop() { +TEST(NVFuserTest, FusionForLoop_CUDA) { // TODO(kir): re-enable this test // due to the current "GpuLower guard" approach, we can only create // kernel IR during GpuLower::lower() @@ -1251,7 +1203,7 @@ void 
testGPU_FusionForLoop() { #endif } -void testGPU_FusionCodeGen() { +TEST(NVFuserTest, FusionCodeGen_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -1288,7 +1240,7 @@ void testGPU_FusionCodeGen() { TORCH_CHECK(output_ref.equal(output)); } -void testGPU_FusionCodeGen2() { +TEST(NVFuserTest, FusionCodeGen2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -1330,7 +1282,7 @@ void testGPU_FusionCodeGen2() { TORCH_CHECK(output_ref.equal(outputs[0])); } -void testGPU_FusionSimplePWise() { +TEST(NVFuserTest, FusionSimplePWise_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // dimensionality of the problem @@ -1387,7 +1339,7 @@ void testGPU_FusionSimplePWise() { TORCH_CHECK(output_ref.equal(output)); } -void testGPU_FusionExecKernel() { +TEST(NVFuserTest, FusionExecKernel_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -1441,7 +1393,7 @@ int ceilDiv_(int a, int b) { return (a + b - 1) / b; } -void testGPU_FusionAdvancedComputeAt() { +TEST(NVFuserTest, FusionAdvancedComputeAt_CUDA) { // Case 1 // tv1 = tv0 * 0.5 // tv2 = tv1 * -1 @@ -1576,11 +1528,7 @@ void testGPU_FusionAdvancedComputeAt() { fe.compileFusion(&fusion); auto outputs = fe.runFusion({t0}); - GpuLower gpulw(&fusion); - std::stringstream actual_kernel; - gpulw.printKernel(actual_kernel); - - TORCH_CHECK(at::allclose(outputs[0], t5), actual_kernel.str()); + TORCH_CHECK(at::allclose(outputs[0], t5)); TORCH_CHECK(at::allclose(outputs[1], t6)); } @@ -1636,11 +1584,7 @@ void testGPU_FusionAdvancedComputeAt() { fe.compileFusion(&fusion); fe.runFusion({t0, t1}, {kernel_tv3}); - GpuLower gpulw(&fusion); - std::stringstream actual_kernel; - gpulw.printKernel(actual_kernel); - - TORCH_CHECK(at::allclose(kernel_tv3, t3), actual_kernel.str()); + TORCH_CHECK(at::allclose(kernel_tv3, t3)); } // Case 4 @@ -1706,11 +1650,7 @@ void testGPU_FusionAdvancedComputeAt() { fe.compileFusion(&fusion); auto outputs = fe.runFusion({t0, t1, t2, t3}); - GpuLower gpulw(&fusion); - std::stringstream actual_kernel; - gpulw.printKernel(actual_kernel); - - TORCH_CHECK(at::allclose(outputs[0], t6), actual_kernel.str()); + TORCH_CHECK(at::allclose(outputs[0], t6)); } // Case 5 @@ -1752,177 +1692,716 @@ void testGPU_FusionAdvancedComputeAt() { } } -void testGPU_FusionScalarInputs() { +TEST(NVFuserTest, FusionComputeAtMultiConsumers_CUDA) { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv2 * -2 Fusion fusion; FusionGuard fg(&fusion); - TensorView* tv0 = makeDummyTensor(2); + TensorView* tv0 = makeDummyTensor(1); fusion.addInput(tv0); - TensorView* tv1 = makeDummyTensor(2); - fusion.addInput(tv1); - Float* f0 = new Float(); - fusion.addInput(f0); - Float* f1 = new Float(); - fusion.addInput(f1); - Float* f2 = new Float(); - fusion.addInput(f2); - Float* f3 = new Float(); - fusion.addInput(f3); - Val* f4 = mul(f0, f1); - Val* f5 = sub(f2, f3); + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv1, new Float(-2.0)); + fusion.addOutput(tv2); + fusion.addOutput(tv3); - TensorView* tv2 = sub(tv1, f4); - TensorView* tv3 = add(tv0, f5); - TensorView* tv4 = mul(tv3, tv2); + // This computeAt will affect tv2 as well, even though tv2 is not in + // the data-flow path between tv1 and tv3. The reason is that tv1 is + // now computed at tv3, so tv2 must also be computed at the same + // location. Overall, what will happen is basically we merge + // expressions of all tensors and compute them in a single loop + // nest. 
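Editor's sketch, not part of this diff: the "single loop nest" the comment above describes, written as plain C++ over flat buffers. Names and the 1-D size are assumptions; the real kernel additionally splits this loop across BIDx/TIDx as scheduled below.

#include <cstddef>

// One pass over the data: tv1 is just a scalar temporary, and tv2/tv3 are
// both produced in the same loop body because tv1 is computed at tv3.
void fused_multi_consumer(const float* t0, float* t2, float* t3, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) {
    const float t1 = t0[i] * 0.5f; // tv1 = tv0 * 0.5
    t2[i] = t1 * -1.0f;            // tv2 = tv1 * -1
    t3[i] = t1 * -2.0f;            // tv3 = tv1 * -2
  }
}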
+ TensorView* computeAtTarget = tv3; + computeAtTarget->split(0, 128); + tv1->computeAt(computeAtTarget, 1); + + TensorView* affected_tensors[] = {tv1, tv2, tv3}; + for (auto tv : affected_tensors) { + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); + } + + // Note that tv2 is also computed at tv3. + TORCH_CHECK(tv1->getComputeAtView() == computeAtTarget); + TORCH_CHECK(tv2->getComputeAtView() == tv3); + TORCH_CHECK(!tv3->hasComputeAt()); + + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); + for (auto tv : affected_tensors) { + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({1000}, options); + + auto t1 = t0 * 0.5; + auto t2 = t1 * -1.0; + auto t3 = t1 * -2.0; + + at::Tensor kernel_tv2 = at::empty_like(t0, options); + at::Tensor kernel_tv3 = at::empty_like(t0, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + fe.runFusion({t0}, {kernel_tv2, kernel_tv3}); + + TORCH_CHECK(at::allclose(kernel_tv2, t2)); + TORCH_CHECK(at::allclose(kernel_tv3, t3)); +} +// Similar to ComputeAtMultiConsumers, but with a common consumer. +TEST(NVFuserTest, FusionComputeAtCommonConsumer1_CUDA) { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv2 * -2 + // tv4 = tv2 + tv3 + // tv5 = tv4 * 5 + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(1); + fusion.addInput(tv0); + + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv1, new Float(-2.0)); + TensorView* tv4 = add(tv2, tv3); + TensorView* tv5 = mul(tv4, new Float(5.0)); + fusion.addOutput(tv3); fusion.addOutput(tv4); + fusion.addOutput(tv5); - // Lets setup to actually run - while (tv4->nDims() > 1) - tv4->merge(0); - tv4->split(0, 128); - tv4->split(0, 4); + // Computing tv1 at tv3. This will affect tv2 as discussed in + // ComplexComputeAt1. Additionally, in this case, notice that tv4 is + // the common consumer of tv2 and tv3, so they are computed at + // tv4. The indirect propagation of the computeAt should stop at the + // common consumer, and no further change should occur. More + // specifically, tv4 and tv5 should not have a computeAt tensor. 
+ TensorView* computeAtTarget = tv3; + computeAtTarget->split(0, 128); + tv1->computeAt(computeAtTarget, 1); + + TensorView* affected_tensors[] = {tv1, tv2, tv3, tv4}; + for (auto tv : affected_tensors) { + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); + } - tv0->computeAt(tv4, 1); - tv1->computeAt(tv4, 1); + TORCH_CHECK(tv1->getComputeAtView() == computeAtTarget); + TORCH_CHECK(tv2->getComputeAtView() == tv4); + TORCH_CHECK(tv3->getComputeAtView() == tv4); + TORCH_CHECK(!tv4->hasComputeAt()); + TORCH_CHECK(!tv5->hasComputeAt()); - tv4->axis(0)->parallelize(ParallelType::BIDx); + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); + + for (auto tv : affected_tensors) { + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({1000}, options); + + auto t1 = t0 * 0.5; + auto t2 = t1 * -1.0; + auto t3 = t1 * -2.0; + auto t4 = t2 + t3; + auto t5 = t4 * 5.0; + + at::Tensor kernel_tv3 = at::empty_like(t0, options); + at::Tensor kernel_tv4 = at::empty_like(t0, options); + at::Tensor kernel_tv5 = at::empty_like(t0, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + fe.runFusion({t0}, {kernel_tv3, kernel_tv4, kernel_tv5}); + + TORCH_CHECK(at::allclose(kernel_tv3, t3)); + TORCH_CHECK(at::allclose(kernel_tv4, t4)); + TORCH_CHECK(at::allclose(kernel_tv5, t5)); +} + +TEST(NVFuserTest, FusionComputeAtCommonConsumer2_CUDA) { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv2 * -1 + // tv4 = tv1 + 4 + // tv5 = tv3 + tv4 + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(2); + fusion.addInput(tv0); + + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv2, new Float(-1.0)); + TensorView* tv4 = add(tv1, new Float(4.0)); + TensorView* tv5 = add(tv3, tv4); + + fusion.addOutput(tv5); + + TensorView* computeAtTarget = tv3; + + computeAtTarget->merge(0); + computeAtTarget->split(0, 128); + computeAtTarget->split(0, 4); + + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); + + // This computeAt will affect all tensors including tv3, tv4 and + // tv5, even though it appears to impact only tv1 and tv2. The + // reason is that tv1 is now computed at tv3, so tv4 must also be + // computed at the same location. Similarly, the consumer of tv4, + // tv5, must also be computed at the same location. Overall, what + // will happen is basically we merge expressions of all tensors and + // compute them in a single loop nest. Internally, this will be + // realized by making all tensors, except for those in the path + // between tv1 and tv3, computed at tv5, which we call the common + // consumer. 
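Editor's sketch, not part of this diff: the merged-then-split schedule this test sets up, as a plain C++ loop nest over the flattened extent. The 4x128 split and the element-wise expressions come from the code above; the buffer layout, bounds handling, and function name are illustrative assumptions.

#include <cstddef>

void fused_common_consumer_2d(const float* t0, float* t5, std::size_t numel) {
  const std::size_t kUnroll = 4;
  const std::size_t kTidx = 128;
  // merge(0) then split(0, 128) then split(0, 4) yields loop levels that map
  // to [BIDx, Unroll=4, TIDx=128] over the flattened 2-D tensor.
  for (std::size_t bidx = 0; bidx * kUnroll * kTidx < numel; ++bidx) {
    for (std::size_t u = 0; u < kUnroll; ++u) {
      for (std::size_t t = 0; t < kTidx; ++t) {
        const std::size_t i = (bidx * kUnroll + u) * kTidx + t;
        if (i >= numel) {
          continue; // boundary predicate for the tail block
        }
        const float t1 = t0[i] * 0.5f; // tv1 = tv0 * 0.5
        const float t2 = t1 * -1.0f;   // tv2 = tv1 * -1
        const float t3 = t2 * -1.0f;   // tv3 = tv2 * -1
        const float t4 = t1 + 4.0f;    // tv4 = tv1 + 4
        t5[i] = t3 + t4;               // tv5 = tv3 + tv4, all in one fused pass
      }
    }
  }
}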
+ tv1->computeAt(computeAtTarget, 1); + + // All tensors should have the same dimenionality as the target + for (Val* val : fusion.vals()) { + if (fusion.hasInput(val) || + val->getValType().value() != ValType::TensorView) { + continue; + } + TensorView* tv = val->as(); + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); + } + + TORCH_CHECK(tv1->getComputeAtView() == tv2); + TORCH_CHECK(tv2->getComputeAtView() == tv3); + // tv3 and tv4 are computed at tv5 + TORCH_CHECK(tv3->getComputeAtView() == tv5); + TORCH_CHECK(tv4->getComputeAtView() == tv5); + TORCH_CHECK(!tv5->hasComputeAt()); for (Val* val : fusion.vals()) { if (!fusion.hasInput(val) && val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - + TensorView* tv = val->as(); tv->axis(1)->parallelize(ParallelType::Unroll); tv->axis(-1)->parallelize(ParallelType::TIDx); } } - // f4 = f0 * f1 - // f5 = f2 - f3 - // t2 = t1 - f4 - // t3 = t0 + f5 - // t4 = t3 * t2 - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - float fl0 = 0.1; - float fl1 = -0.2; - float fl2 = 0.3; - float fl3 = -0.4; - float fl4 = fl0 * fl1; - float fl5 = fl2 - fl3; - at::Tensor t0 = at::randn({129, 127}, options); - at::Tensor t1 = at::rand_like(t0, options); - - auto t2 = t1.sub(fl4); - auto t3 = t0.add(fl5); - auto t4 = t3.mul(t2); - at::Tensor kernel_tv4 = at::empty_like(t0, options); + auto t1 = t0.mul({0.5}); + auto t2 = t1.mul({-1.0}); + auto t3 = t2.mul({-1.0}); + auto t4 = t1.add({4.0}); + auto t5 = t3 + t4; - at::Scalar test(fl0); + at::Tensor kernel_tv5 = at::empty_like(t0, options); torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - fe.runFusion( - {t0, - t1, - at::Scalar(fl0), - at::Scalar(fl1), - at::Scalar(fl2), - at::Scalar(fl3)}, - {kernel_tv4}); - - GpuLower gpulw(&fusion); - std::stringstream actual_kernel; - gpulw.printKernel(actual_kernel); + fe.runFusion({t0}, {kernel_tv5}); - TORCH_CHECK(at::allclose(kernel_tv4, t4), actual_kernel.str()); + TORCH_CHECK(at::allclose(kernel_tv5, t5)); } -void testGPU_FusionLoopUnroll() { +// Similar to the above common consumer test but adds an additional +// tensor that has no common consumer with the other tensors. 
+TEST(NVFuserTest, FusionComputeAtCommonConsumer3_CUDA) { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv2 * -1 + // tv4 = tv1 + 4 + // tv5 = tv2 + tv3 + // tv6 = tv1 + 6 Fusion fusion; FusionGuard fg(&fusion); - // Set up your input tensor views - TensorView* tv0 = makeDummyTensor(3); - TensorView* tv1 = makeDummyTensor(3); - - // Register your inputs + TensorView* tv0 = makeDummyTensor(2); fusion.addInput(tv0); - fusion.addInput(tv1); - // Do math with it, it returns a `Val*` but can be static_casted back to - // TensorView - TensorView* tv2 = add(tv1, new Float(2.0)); - TensorView* tv3 = add(tv0, tv2); + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv2, new Float(-1.0)); + TensorView* tv4 = add(tv1, new Float(4.0)); + TensorView* tv5 = add(tv3, tv4); + TensorView* tv6 = add(tv1, new Float(6.0)); - // Register your outputs - fusion.addOutput(tv3); + fusion.addOutput(tv5); + fusion.addOutput(tv6); - int block_size = 16; + TensorView* computeAtTarget = tv3; - tv3->merge(0, 1); - tv3->merge(0, 1); + computeAtTarget->merge(0); + computeAtTarget->split(0, 128); + computeAtTarget->split(0, 4); - tv3->split(0, block_size); - tv3->split(0, 4); + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); - // For all inputs, computeAt the output inline, temporaries should be squeezed - // between them - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); + // This will have the same impact on the tensors except for tv5 and + // tv6. tv6 does not have any common consumer with the computeAt + // target, but since it uses tv1, it must be also computed at the + // same location as the other impacted tensors. We can either make + // tv5 computed at tv6 or tv6 computed at tv5. In this case, tv5 + // should be computed at tv6 just because the current implementation + // orders the computeAt relationship based on the order in which + // tensors are specified as outputs. - // Parallelize - tv2->axis(1)->parallelize(ParallelType::Unroll); - tv3->axis(1)->parallelize(ParallelType::Unroll); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(0)->parallelize(ParallelType::BIDx); + tv1->computeAt(computeAtTarget, 1); + + // All tensors should have the same dimenionality as the target + for (Val* val : fusion.vals()) { + if (fusion.hasInput(val) || + val->getValType().value() != ValType::TensorView) { + continue; + } + TensorView* tv = val->as(); + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); + } + + TORCH_CHECK(tv1->getComputeAtView() == tv2); + TORCH_CHECK(tv2->getComputeAtView() == tv3); + + // tv3 and tv4 are computed at tv5 + TORCH_CHECK(tv3->getComputeAtView() == tv5); + TORCH_CHECK(tv4->getComputeAtView() == tv5); + + // tv5 should be computed at tv6 since tv5 is added as an output + // before tv6. If we call fusion.addOutput(tv6) first, tv6 should be + // computed at tv5. 
+ TORCH_CHECK(tv5->getComputeAtView() == tv6); + TORCH_CHECK(!tv6->hasComputeAt()); + + for (Val* val : fusion.vals()) { + if (!fusion.hasInput(val) && + val->getValType().value() == ValType::TensorView) { + TensorView* tv = val->as<TensorView>(); + tv->axis(1)->parallelize(ParallelType::Unroll); + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + } auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input0 = at::rand({129, 13, 3}, options); - at::Tensor input1 = at::rand({129, 13, 3}, options); + at::Tensor t0 = at::randn({129, 127}, options); + + auto t1 = t0.mul({0.5}); + auto t2 = t1.mul({-1.0}); + auto t3 = t2.mul({-1.0}); + auto t4 = t1.add({4.0}); + auto t5 = t3 + t4; + auto t6 = t1.add({6.0}); + + at::Tensor kernel_tv5 = at::empty_like(t0, options); + at::Tensor kernel_tv6 = at::empty_like(t0, options); torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input0, input1}); + fe.runFusion({t0}, {kernel_tv5, kernel_tv6}); - TORCH_CHECK(outputs[0].equal(input0.add(input1.add(2.0)))); + TORCH_CHECK(at::allclose(kernel_tv5, t5)); + TORCH_CHECK(at::allclose(kernel_tv6, t6)); } -/* - * Helper function for single op testing that generates a codegen operand - */ +// Similar to ComputeAtCommonConsumer1 but with an additional tensor +// that does not have a data dependency with the consumer. +TEST(NVFuserTest, FusionComputeAtNoCommonConsumer_CUDA) { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv1 * -2 + // tv4 = tv2 + tv3 + // tv5 = tv4 * 5 + // tv6 = tv1 * 6 + Fusion fusion; + FusionGuard fg(&fusion); -Val* gen_jit_operand(std::pair<ValType, DataType> desc) { - if (desc.first == ValType::TensorView) { - return makeDummyTensor(2, desc.second); - } else if (desc.first == ValType::Scalar) { - if (desc.second == DataType::Float) - return new Float(); - else if (desc.second == DataType::Int) - return new Int(); - else - TORCH_CHECK("Not currently supported type", desc.first); - } else { - TORCH_CHECK("Not currently supported type", desc.first); + TensorView* tv0 = makeDummyTensor(1); + fusion.addInput(tv0); + + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv1, new Float(-2.0)); + TensorView* tv4 = add(tv2, tv3); + TensorView* tv5 = mul(tv4, new Float(5.0)); + // Notice that tv6 is not a consumer of tv4. 
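+ // It only consumes tv1, so it is still transformed by the computeAt below + // even though it sits outside tv4's consumer chain.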
+ TensorView* tv6 = mul(tv1, new Float(6.0)); + fusion.addOutput(tv3); + fusion.addOutput(tv4); + fusion.addOutput(tv5); + fusion.addOutput(tv6); + + TensorView* computeAtTarget = tv3; + computeAtTarget->split(0, 128); + tv1->computeAt(computeAtTarget, 1); + + TensorView* affected_tensors[] = {tv1, tv2, tv3, tv4, tv6}; + for (auto tv : affected_tensors) { + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); } - return nullptr; -} -/* - * Helper function for single op testing that generates an ATen operand - */ + TORCH_CHECK(tv1->getComputeAtView() == computeAtTarget); + TORCH_CHECK(tv2->getComputeAtView() == tv4); + TORCH_CHECK(tv3->getComputeAtView() == tv4); + TORCH_CHECK(tv4->getComputeAtView() == tv5); + TORCH_CHECK(tv5->getComputeAtView() == tv6); + TORCH_CHECK(!tv6->hasComputeAt()); -IValue gen_aten_operand( - std::pair desc, - int blocks, - int threads, + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); + + for (auto tv : affected_tensors) { + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({1000}, options); + + auto t1 = t0 * 0.5; + auto t2 = t1 * -1.0; + auto t3 = t1 * -2.0; + auto t4 = t2 + t3; + auto t5 = t4 * 5.0; + auto t6 = t1 * 6.0; + + at::Tensor kernel_tv3 = at::empty_like(t0, options); + at::Tensor kernel_tv4 = at::empty_like(t0, options); + at::Tensor kernel_tv5 = at::empty_like(t0, options); + at::Tensor kernel_tv6 = at::empty_like(t0, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + fe.runFusion({t0}, {kernel_tv3, kernel_tv4, kernel_tv5, kernel_tv6}); + + TORCH_CHECK(at::allclose(kernel_tv3, t3)); + TORCH_CHECK(at::allclose(kernel_tv4, t4)); + TORCH_CHECK(at::allclose(kernel_tv5, t5)); + TORCH_CHECK(at::allclose(kernel_tv6, t6)); +} + +namespace { + +void checkConcretized( + TensorView* v0, + int a0, + TensorView* v1, + int a1, + bool should_concretize) { + if (should_concretize) { + TORCH_CHECK( + IterDomain::concretizeDomain(v0->axis(a0))->sameAs(v1->axis(a1))); + } else { + TORCH_CHECK( + !IterDomain::concretizeDomain(v0->axis(a0))->sameAs(v1->axis(a1))); + } +} + +} // namespace + +TEST(NVFuserTest, FusionBCastConcretizeBasic_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + // tv0: [I I] + TensorView* tv0 = makeDummyTensor(2); + + // tv1: [I I I] + TensorView* tv1 = makeDummyTensor(3); + + fusion.addInput(tv0); + fusion.addInput(tv1); + + // tv2*: [B I I] + auto tv2_0 = broadcast(tv0, {true, false, false}); + auto tv2_1 = broadcast(tv0, {true, false, false}); + auto tv2 = add(tv2_0, tv2_1); + + // tv3: [I I I] + auto tv3 = add(tv2, tv1); + + fusion.addOutput(tv3); + + checkConcretized(tv2, 0, tv1, 0, true); + checkConcretized(tv2_0, 0, tv1, 0, true); + checkConcretized(tv2_1, 0, tv1, 0, true); + checkConcretized(tv2_0, 1, tv1, 0, false); + checkConcretized(tv2_0, 0, tv1, 1, false); +} + +TEST(NVFuserTest, FusionBCastConcretizeRfactor_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + // both tv0 and tv1 = [I, I] + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = makeDummyTensor(2); + + //[B,I,I] + auto tv2 = broadcast(tv1, {true, false, false}); + + //[B,I,R] + auto tv3 = sum(tv2, {2}); + + auto tv5 = add(tv3, tv1); + + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addOutput(tv5); + + // scheduling: + //[B,I,R0,R1=128], root = [B,I,R] + tv3->split(2, 128); + + // root=[B,I,Irf], rfactor=[B,I,Irf,Rrf] + auto tv4 = tv3->rFactor({3}); + + checkConcretized(tv2, 0, tv5, 0, 
true); + checkConcretized(tv4, 0, tv5, 0, true); + checkConcretized(tv3, 0, tv5, 0, true); +} + +namespace { + +void checkIdProvedEquivalent( + TensorView* v0, + int a0, + TensorView* v1, + int a1, + bool should_prove) { + if (should_prove) { + TORCH_CHECK(IterDomain::proveEquivalent(v0->axis(a0), v1->axis(a1))); + } else { + TORCH_CHECK(!IterDomain::proveEquivalent(v0->axis(a0), v1->axis(a1))); + } +} + +} // namespace + +TEST(NVFuserTest, FusionProveIdEqBasic_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = makeDummyTensor(2); + TensorView* tv2 = makeDummyTensor(3); + + fusion.addInput(tv0); + fusion.addInput(tv1); + auto tv3 = broadcast(tv0, {true, false, false}); + auto tv4 = broadcast(tv1, {false, true, false}); + auto tv5 = add(tv3, tv4); + fusion.addOutput(tv5); + + checkIdProvedEquivalent(tv0, 0, tv4, 1, true); + checkIdProvedEquivalent(tv1, 0, tv4, 0, true); + checkIdProvedEquivalent(tv1, 1, tv0, 1, true); + checkIdProvedEquivalent(tv0, 0, tv5, 1, true); + checkIdProvedEquivalent(tv1, 1, tv5, 2, true); + checkIdProvedEquivalent(tv0, 0, tv1, 0, false); + checkIdProvedEquivalent(tv0, 1, tv1, 0, false); + checkIdProvedEquivalent(tv0, 0, tv1, 1, false); +} + +TEST(NVFuserTest, FusionProveIdEqRfactor_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + // [I,I] + TensorView* tv0 = makeDummyTensor(2); + // [I,I,I] + TensorView* tv1 = makeDummyTensor(3); + + //[I,I,R] + auto tv2 = sum(tv1, {2}); + + auto tv5 = add(tv2, tv0); + + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addOutput(tv5); + + // scheduling: + //[B,I,R0,R1=128], root = [B,I,R] + tv2->split(2, 128); + + // root=[B,I,Irf], rfactor=[B,I,Irf,Rrf] + auto tv3 = tv2->rFactor({3}); + + checkIdProvedEquivalent(tv1, 0, tv0, 0, true); + checkIdProvedEquivalent(tv2, 0, tv0, 0, true); + checkIdProvedEquivalent(tv3, 0, tv0, 0, true); +} + +TEST(NVFuserTest, FusionScalarInputs_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(2); + fusion.addInput(tv0); + TensorView* tv1 = makeDummyTensor(2); + fusion.addInput(tv1); + + Float* f0 = new Float(); + fusion.addInput(f0); + Float* f1 = new Float(); + fusion.addInput(f1); + Float* f2 = new Float(); + fusion.addInput(f2); + Float* f3 = new Float(); + fusion.addInput(f3); + Val* f4 = mul(f0, f1); + Val* f5 = sub(f2, f3); + + TensorView* tv2 = sub(tv1, f4); + TensorView* tv3 = add(tv0, f5); + TensorView* tv4 = mul(tv3, tv2); + + fusion.addOutput(tv4); + + // Lets setup to actually run + while (tv4->nDims() > 1) + tv4->merge(0); + tv4->split(0, 128); + tv4->split(0, 4); + + tv0->computeAt(tv4, 1); + tv1->computeAt(tv4, 1); + + tv4->axis(0)->parallelize(ParallelType::BIDx); + + for (Val* val : fusion.vals()) { + if (!fusion.hasInput(val) && + val->getValType().value() == ValType::TensorView) { + TensorView* tv = static_cast(val); + + tv->axis(1)->parallelize(ParallelType::Unroll); + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + } + + // f4 = f0 * f1 + // f5 = f2 - f3 + // t2 = t1 - f4 + // t3 = t0 + f5 + // t4 = t3 * t2 + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + float fl0 = 0.1; + float fl1 = -0.2; + float fl2 = 0.3; + float fl3 = -0.4; + float fl4 = fl0 * fl1; + float fl5 = fl2 - fl3; + + at::Tensor t0 = at::randn({129, 127}, options); + at::Tensor t1 = at::rand_like(t0, options); + + auto t2 = t1.sub(fl4); + auto t3 = t0.add(fl5); + auto t4 = t3.mul(t2); + + at::Tensor kernel_tv4 = at::empty_like(t0, options); + + at::Scalar 
test(fl0); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + fe.runFusion( + {t0, + t1, + at::Scalar(fl0), + at::Scalar(fl1), + at::Scalar(fl2), + at::Scalar(fl3)}, + {kernel_tv4}); + + TORCH_CHECK(at::allclose(kernel_tv4, t4)); +} + +TEST(NVFuserTest, FusionLoopUnroll_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeDummyTensor(3); + TensorView* tv1 = makeDummyTensor(3); + + // Register your inputs + fusion.addInput(tv0); + fusion.addInput(tv1); + + // Do math with it, it returns a `Val*` but can be static_casted back to + // TensorView + TensorView* tv2 = add(tv1, new Float(2.0)); + TensorView* tv3 = add(tv0, tv2); + + // Register your outputs + fusion.addOutput(tv3); + + int block_size = 16; + + tv3->merge(0, 1); + tv3->merge(0, 1); + + tv3->split(0, block_size); + tv3->split(0, 4); + + // For all inputs, computeAt the output inline, temporaries should be squeezed + // between them + tv0->computeAt(tv3, 1); + tv1->computeAt(tv3, 1); + + // Parallelize + tv2->axis(1)->parallelize(ParallelType::Unroll); + tv3->axis(1)->parallelize(ParallelType::Unroll); + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv3->axis(-1)->parallelize(ParallelType::TIDx); + tv3->axis(0)->parallelize(ParallelType::BIDx); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor input0 = at::rand({129, 13, 3}, options); + at::Tensor input1 = at::rand({129, 13, 3}, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({input0, input1}); + + TORCH_CHECK(outputs[0].equal(input0.add(input1.add(2.0)))); +} + +/* + * Helper function for single op testing that generates a codegen operand + */ + +Val* gen_jit_operand(std::pair desc) { + if (desc.first == ValType::TensorView) { + return makeDummyTensor(2, desc.second); + } else if (desc.first == ValType::Scalar) { + if (desc.second == DataType::Float) + return new Float(); + else if (desc.second == DataType::Int) + return new Int(); + else + TORCH_CHECK("Not currently supported type", desc.first); + } else { + TORCH_CHECK("Not currently supported type", desc.first); + } + return nullptr; +} + +/* + * Helper function for single op testing that generates an ATen operand + */ + +IValue gen_aten_operand( + std::pair desc, + int blocks, + int threads, bool rand) { if (desc.first == ValType::TensorView) { if (desc.second == DataType::Float) { @@ -2012,7 +2491,7 @@ void test_op( gen_aten_operand(op, blocks, threads, /*rand*/ false).toTensor(); std::vector output_vect = {output}; cudaDeviceSynchronize(); - if (fusion.hasRNG()) + if (fusion.isStochastic()) at::manual_seed(0); torch::jit::fuser::cuda::FusionExecutor fe; @@ -2020,7 +2499,7 @@ void test_op( fe.runFusion(aten_inputs_ivalues, output_vect); cudaDeviceSynchronize(); - if (fusion.hasRNG()) + if (fusion.isStochastic()) at::manual_seed(0); at::Tensor ref_output = af(aten_inputs); cudaDeviceSynchronize(); // This sync shouldn't be necessary; @@ -2054,12 +2533,8 @@ void test_op( op_str, " -- had a mismatch.", aten_inputs_to_str(), - "\nJIT: ", - output, - "\nREF: ", - ref_output, - "\nDIFF: ", - diff, + "\nABS MAX DIFF: ", + output.sub(ref_output).abs().max(), "\n"); } @@ -2088,7 +2563,7 @@ void test_op( std::make_index_sequence{}); } -void testGPU_FusionUnaryOps() { +TEST(NVFuserTest, FusionUnaryOps_CUDA) { using OpTuple = std::tuple; @@ -2162,7 +2637,7 @@ void testGPU_FusionUnaryOps() { 
std::make_tuple(std::make_pair(ValType::TensorView, DataType::Float))); } -void testGPU_FusionBinaryOps() { +TEST(NVFuserTest, FusionBinaryOps_CUDA) { using AtenFuncSig = at::Tensor (*)(const at::Tensor&, const at::Tensor&); using OpTuple = std::tuple; @@ -2262,7 +2737,7 @@ void testGPU_FusionBinaryOps() { std::make_pair(ValType::Scalar, DataType::Float))); } -void testGPU_FusionTernaryOps() { +TEST(NVFuserTest, FusionTernaryOps_CUDA) { test_op( /*blocks*/ 640, /*threads*/ 64, @@ -2311,7 +2786,7 @@ void testGPU_FusionTernaryOps() { std::make_pair(ValType::TensorView, DataType::Float))); } -void testGPU_FusionCompoundOps() { +TEST(NVFuserTest, FusionCompoundOps_CUDA) { test_op( /*blocks*/ 640, /*threads*/ 64, @@ -2350,7 +2825,7 @@ void testGPU_FusionCompoundOps() { std::make_pair(ValType::Scalar, DataType::Float))); } -void testGPU_FusionCastOps() { +TEST(NVFuserTest, FusionCastOps_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -2385,20 +2860,14 @@ void testGPU_FusionCastOps() { "\nOp Type: -- ", "cast FP16->FP32->FP16", " -- had a mismatch.\n", - "IN1 : ", - input1, - "\n", - "JIT: ", - outputs[0], - "\n", - "REF: ", - ref_output, + "\nABS MAX DIFF: ", + outputs[0].sub(ref_output).abs().max(), "\n"); } // We want split/merge/reorder all tested both on and off rfactor domains, also // want compute at into the rfactor domain, and into its consumer -void testGPU_FusionRFactorReplay() { +TEST(NVFuserTest, FusionRFactorReplay_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -2491,7 +2960,7 @@ void testGPU_FusionRFactorReplay() { // Start off simple, block on the outer dim // block stride + thread all reduce + unrolling on inner dim -void testGPU_FusionReduction() { +TEST(NVFuserTest, FusionReduction_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -2549,7 +3018,7 @@ void testGPU_FusionReduction() { TORCH_CHECK(aten_output.allclose(cg_output)); } -void testGPU_FusionReduction2() { +TEST(NVFuserTest, FusionReduction2_CUDA) { { Fusion fusion; FusionGuard fg(&fusion); @@ -2676,7 +3145,7 @@ void testGPU_FusionReduction2() { } } -void testGPU_FusionReduction3() { +TEST(NVFuserTest, FusionReduction3_CUDA) { { Fusion fusion; FusionGuard fg(&fusion); @@ -2747,7 +3216,7 @@ void testGPU_FusionReduction3() { } } -void testGPU_FusionReduction4() { +TEST(NVFuserTest, FusionReduction4_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -2799,7 +3268,7 @@ void testGPU_FusionReduction4() { aten_output.sub(cg_output).abs().max()); } -void testGPU_FusionReduction5() { +TEST(NVFuserTest, FusionReduction5_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -2860,7 +3329,7 @@ void testGPU_FusionReduction5() { TORCH_CHECK(aten_output.allclose(outputs[0])); } -void testGPU_FusionReductionTFT() { +TEST(NVFuserTest, FusionReductionTFT_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -2917,7 +3386,7 @@ void testGPU_FusionReductionTFT() { TORCH_CHECK(aten_output.allclose(cg_output)); } -void testGPU_FusionBranches() { +TEST(NVFuserTest, FusionBranches_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -2974,7 +3443,7 @@ void testGPU_FusionBranches() { TORCH_CHECK(t6.allclose(outputs[0])); } -void testGPU_FusionSimpleBCast() { +TEST(NVFuserTest, FusionSimpleBCast_CUDA) { { Fusion fusion; FusionGuard fg(&fusion); @@ -3238,7 +3707,7 @@ void testGPU_FusionSimpleBCast() { } } -void testGPU_FusionComplexBCast() { +TEST(NVFuserTest, FusionComplexBCast_CUDA) { { Fusion fusion; FusionGuard fg(&fusion); @@ -3341,7 +3810,7 @@ void testGPU_FusionComplexBCast() { } } -void testGPU_FusionAdvancedIndexing() { +TEST(NVFuserTest, 
FusionAdvancedIndexing_CUDA) { // Merging left to right is still broken in some instances. Indexing can't // complete because we assume we can simply traverse consumer->producer in the // index/extent map, but this case breaks this assumption. @@ -3453,10 +3922,6 @@ void testGPU_FusionAdvancedIndexing() { FusionGuard fg(&fusion); int w = 3, x = 4, y = 7, z = 8; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({x, y, z}, options); - at::Tensor t1 = at::randn({w, x, y, z}, options); auto tv0 = makeDummyTensor(3); auto tv1 = makeDummyTensor(4); @@ -3465,10 +3930,42 @@ void testGPU_FusionAdvancedIndexing() { auto tv2 = add(tv0, new Float(1.0)); auto tv3 = add(tv2, tv1); - fusion.addOutput(tv3); - fuser::cuda::scheduleFusion(&fusion, {t0, t1}); + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({x, y, z}, options); + at::Tensor t1 = at::randn({w, x, y, z}, options); + + fuser::cuda::scheduleFusion(&fusion, {t0, t1}); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({t0, t1}); + + auto t2 = t0.add(1.0); + auto t3 = t2.add(t1); + + TORCH_CHECK(t3.allclose(outputs[0])); + } + + { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeConcreteTensor({10, 20}); + fusion.addInput(tv0); + TensorView* tv1 = makeConcreteTensor({10, 10, 20}); + fusion.addInput(tv1); + + TensorView* tv2 = add(tv0, new Float(1)); + TensorView* tv3 = broadcast(tv2, {true, false, false}); + TensorView* tv4 = add(tv3, tv1); + fusion.addOutput(tv4); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({10, 20}, options); + at::Tensor t1 = at::randn({10, 10, 20}, options); torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); @@ -3482,7 +3979,7 @@ void testGPU_FusionAdvancedIndexing() { } // Test a simple Gemm but also play around with fusion executor features -void testGPU_FusionSimpleGemm() { +TEST(NVFuserTest, FusionSimpleGemm_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3569,7 +4066,7 @@ void testGPU_FusionSimpleGemm() { } // Softmax with a 1D tensor. Parallelized only with a single thread block. -void testGPU_FusionSoftmax1D() { +TEST(NVFuserTest, FusionSoftmax1D_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3626,7 +4123,7 @@ void testGPU_FusionSoftmax1D() { } // Softmax with a 1D tensor with input normalization. -void testGPU_FusionSoftmax1DNormalized() { +TEST(NVFuserTest, FusionSoftmax1DNormalized_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3697,7 +4194,7 @@ void testGPU_FusionSoftmax1DNormalized() { // Softmax with a 3D tensor, where the inner-most 3rd dimension is // normalized. Pallelized with multiple thread blocks. -void testGPU_FusionSoftmax3D() { +TEST(NVFuserTest, FusionSoftmax3D_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3757,7 +4254,7 @@ void testGPU_FusionSoftmax3D() { } // Softmax with a 3D tensor with input normalization. 
-void testGPU_FusionSoftmax3DNormalized() { +TEST(NVFuserTest, FusionSoftmax3DNormalized_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3831,7 +4328,7 @@ void testGPU_FusionSoftmax3DNormalized() { t2.sub(outputs[0]).abs().max()); } -void testGPU_FusionSoftmaxComputeAt() { +TEST(NVFuserTest, FusionSoftmaxComputeAt_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3857,7 +4354,7 @@ void testGPU_FusionSoftmaxComputeAt() { } // Similar to FusionReduction but uses grid reduction -void testGPU_FusionGridReduction1() { +TEST(NVFuserTest, FusionGridReduction1_CUDA) { const int gdimx = 32; const int bdimx = 128; @@ -3915,7 +4412,7 @@ void testGPU_FusionGridReduction1() { } // Same test as the above but uses BIDy and TIDx for reduction -void testGPU_FusionGridReduction2() { +TEST(NVFuserTest, FusionGridReduction2_CUDA) { const int gdimy = 32; const int bdimx = 128; @@ -3970,7 +4467,7 @@ void testGPU_FusionGridReduction2() { } // Same test but uses BIDy and BIDz for reduction. No TID used. -void testGPU_FusionGridReduction3dim1() { +TEST(NVFuserTest, FusionGridReduction3dim1_CUDA) { const int gdimz = 32; const int gdimy = 128; @@ -4026,7 +4523,7 @@ void testGPU_FusionGridReduction3dim1() { } // Same as testGPU_FusionGridReduction3dim1 but reduces dimension 0 -void testGPU_FusionGridReduction3dim0() { +TEST(NVFuserTest, FusionGridReduction3dim0_CUDA) { const int rdim = 0; const int gdimy = 128; const int gdimz = 32; @@ -4079,7 +4576,7 @@ void testGPU_FusionGridReduction3dim0() { } // This is similar to the FusionReduction, but swaps BIDx and TIDx -void testGPU_FusionGridReduction4() { +TEST(NVFuserTest, FusionGridReduction4_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4142,7 +4639,7 @@ void testGPU_FusionGridReduction4() { // Grid reduction with 2D thread blocks but only TIDx and BIDx are // mapped to a reduction dim -void testGPU_FusionGridReduction5() { +TEST(NVFuserTest, FusionGridReduction5_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4194,7 +4691,7 @@ void testGPU_FusionGridReduction5() { } // Similar to FusionGridReduction1 but with 3D tensors -void testGPU_FusionGridReduction6() { +TEST(NVFuserTest, FusionGridReduction6_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4255,7 +4752,7 @@ void testGPU_FusionGridReduction6() { TORCH_CHECK(aten_output.allclose(cg_output)); } -void testGPU_FusionNonRedAxisBind() { +TEST(NVFuserTest, FusionNonRedAxisBind_CUDA) { int bid_x = 3; int tid_x = 2; int red_dim = 0; @@ -4290,7 +4787,7 @@ void testGPU_FusionNonRedAxisBind() { aten_output.sub(outputs[0]).abs().max()); } -void testGPU_FusionSplitBCast() { +TEST(NVFuserTest, FusionSplitBCast_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4338,7 +4835,7 @@ void testGPU_FusionSplitBCast() { fe.runFusion({t0, t1}, {cg_output}); } -void testGPU_FusionBCastInnerDim() { +TEST(NVFuserTest, FusionBCastInnerDim_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4352,7 +4849,7 @@ void testGPU_FusionBCastInnerDim() { TORCH_CHECK(!tv2->axis(0)->isReduction() && tv2->axis(1)->isBroadcast()); } -void testGPU_FusionBCastReduce() { +TEST(NVFuserTest, FusionBCastReduce_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4368,7 +4865,7 @@ void testGPU_FusionBCastReduce() { // Multiple consumer reduction with computeAt // https://github.com/csarofeen/pytorch/issues/110 -void testGPU_FusionReductionMultiConsumer() { +TEST(NVFuserTest, FusionReductionMultiConsumer_CUDA) { Fusion fusion; FusionGuard fg(&fusion); TensorView* tv0 = makeDummyTensor(2); @@ -4385,7 +4882,7 @@ void 
testGPU_FusionReductionMultiConsumer() { tv1->getThisComputeAtAxis() == 2 && tv1->getRelativeComputeAtAxis() == 2); } -void testGPU_FusionComputeAtExprOrder() { +TEST(NVFuserTest, FusionComputeAtExprOrder_CUDA) { { for (int i = 0; i < 2; ++i) { Fusion fusion; @@ -4455,7 +4952,7 @@ void testGPU_FusionComputeAtExprOrder() { } } -void testGPU_FusionZeroDimComputeAt() { +TEST(NVFuserTest, FusionZeroDimComputeAt_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4482,7 +4979,7 @@ void testGPU_FusionZeroDimComputeAt() { aten_output.sub(outputs[0]).abs().max()); } -void testGPU_FusionZeroDimBroadcast() { +TEST(NVFuserTest, FusionZeroDimBroadcast_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4518,7 +5015,7 @@ void testGPU_FusionZeroDimBroadcast() { aten_output.sub(output).abs().max()); } -void testGPU_FusionZeroDimReduction() { +TEST(NVFuserTest, FusionZeroDimReduction_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4555,7 +5052,7 @@ void testGPU_FusionZeroDimReduction() { aten_output.sub(output).abs().max()); } -void testGPU_FusionBCastAfterReduce() { +TEST(NVFuserTest, FusionBCastAfterReduce_CUDA) { Fusion fusion; FusionGuard fg(&fusion); const int tidx = 128; @@ -4606,7 +5103,7 @@ void testGPU_FusionBCastAfterReduce() { TORCH_CHECK(t5.allclose(outputs[0], 1e-5, 1e-5)); } -void testGPU_FusionReductionScheduler() { +TEST(NVFuserTest, FusionReductionScheduler_CUDA) { constexpr int bid_x = 80; constexpr int tid_x = 4096; constexpr int red_dim = 1; @@ -4624,29 +5121,27 @@ void testGPU_FusionReductionScheduler() { const auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::rand({bid_x, tid_x}, options); + at::Tensor input = at::randn({bid_x, tid_x}, options); // Apply reduction heuristic - const at::ArrayRef inputs({input}); - - TORCH_CHECK( - cuda::scheduleReduction(&fusion, inputs, tv1), - "Reduction schedule was not generated!"); + auto reduction_params = cuda::getReductionHeuristics(&fusion, {input}, tv1); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + cuda::scheduleReduction(&fusion, reduction_params.value(), tv1, {}); cuda::FusionExecutor fe; fe.compileFusion(&fusion); // no broadcasting needed, omitting the last optional argument; - auto outputs = fe.runFusion({input}); + auto outputs = fe.runFusion({input}, reduction_params.value().lparams); auto aten_output = input.sum({red_dim}); TORCH_CHECK( - aten_output.allclose(outputs[0]), + aten_output.allclose(outputs[0], 1e-04, 1e-04), "Error of: ", aten_output.sub(outputs[0]).abs().max()); } // Simple reduction parallelized on a symbolic size. 
-void testGPU_FusionSymbolicReduction() { +TEST(NVFuserTest, FusionSymbolicReduction_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4685,9 +5180,9 @@ void testGPU_FusionSymbolicReduction() { // How many threads to use for the block reduction int runtime_threadIdx_dim = 128; - torch::jit::fuser::cuda::FusionExecutor executor; - executor.compileFusion(&fusion); - auto outputs = executor.runFusion( + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion( {input}, torch::jit::fuser::cuda::LaunchParams( -1, -1, -1, runtime_threadIdx_dim, -1, -1)); @@ -4696,7 +5191,7 @@ void testGPU_FusionSymbolicReduction() { TORCH_CHECK(aten_output.allclose(outputs[0])); } -void testGPU_FusionReductionSchedulerMultiDimNonFastest() { +TEST(NVFuserTest, FusionReductionSchedulerMultiDimNonFastest_CUDA) { const std::vector red_dims = {0, 2}; // Copy is because CodeGen requires int and Pytorch requires int64_t // for a vector of reduction dimensions @@ -4716,29 +5211,27 @@ void testGPU_FusionReductionSchedulerMultiDimNonFastest() { const auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::rand(tensor_dims_in, options); + at::Tensor input = at::randn(tensor_dims_in, options); at::Tensor cg_output = at::empty(tensor_dims_out, options); // Apply reduction heuristic - const at::ArrayRef inputs({input}); - - TORCH_CHECK( - cuda::scheduleReduction(&fusion, inputs, tv1), - "Reduction schedule was not generated!"); + auto reduction_params = cuda::getReductionHeuristics(&fusion, {input}, tv1); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + cuda::scheduleReduction(&fusion, reduction_params.value(), tv1, {}); torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input}); + auto outputs = fe.runFusion({input}, reduction_params.value().lparams); auto aten_output = input.sum(red_dims64); TORCH_CHECK( - aten_output.allclose(outputs[0]), + aten_output.allclose(outputs[0], 1e-04, 1e-04), "Error of: ", aten_output.sub(outputs[0]).abs().max()); } -void testGPU_FusionReductionSchedulerMultiDimFastest() { +TEST(NVFuserTest, FusionReductionSchedulerMultiDimFastest_CUDA) { const std::vector red_dims = {1, 3}; // Copy is because CodeGen requires int and Pytorch requires int64_t // for a vector of reduction dimensions @@ -4758,26 +5251,26 @@ void testGPU_FusionReductionSchedulerMultiDimFastest() { const auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::rand(tensor_dims_in, options); + at::Tensor input = at::randn(tensor_dims_in, options); - TORCH_CHECK( - cuda::scheduleReduction(&fusion, {input}, tv1), - "Reduction schedule was not generated!"); + auto reduction_params = cuda::getReductionHeuristics(&fusion, {input}, tv1); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + cuda::scheduleReduction(&fusion, reduction_params.value(), tv1, {}); torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input}); + auto outputs = fe.runFusion({input}, reduction_params.value().lparams); auto aten_output = input.sum(red_dims64); TORCH_CHECK( - aten_output.allclose(outputs[0]), + aten_output.allclose(outputs[0], 1e-05, 1e-05), "Error of: ", aten_output.sub(outputs[0]).abs().max()); } -void testGPU_FusionReductionSchedulerDimShmoo() { - std::vector fp16_usage = {false}; +TEST(NVFuserTest, FusionReductionSchedulerDimShmoo_CUDA) { + std::vector fp16_usage = 
{true, false}; std::vector red_axis = {1, 0}; std::vector output_dims = {320, 640}; std::vector red_dims; @@ -4821,47 +5314,38 @@ void testGPU_FusionReductionSchedulerDimShmoo() { .dtype((fp16 ? at::kHalf : at::kFloat)) .device(at::kCUDA, 0); at::Tensor input = - (axis ? at::rand({odim, rdim}, options) - : at::rand({rdim, odim}, options)); - - const at::ArrayRef inputs({input}); + (axis ? at::randn({odim, rdim}, options) + : at::randn({rdim, odim}, options)); - c10::optional rparams = - cuda::scheduleReduction(&fusion, inputs, tv1); - TORCH_CHECK(rparams != c10::nullopt, "Reduction is not found!"); + std::vector outputs_of_red; if (fp16) { - if (axis == 0) { - int tidx = rparams.value().lparams.bdimx(); - tv1_cast->split(-1, tidx); - tv1_cast->axis(-1)->parallelize(ParallelType::TIDx); - tv1_cast->axis(-2)->parallelize(ParallelType::BIDx); - } else { - if (rparams.value().mul_reds_per_blk) { - int tidy = rparams.value().lparams.bdimy(); - tv1_cast->split(0, tidy); - tv1_cast->axis(-1)->parallelize(ParallelType::TIDy); - } - tv1_cast->axis(0)->parallelize(ParallelType::BIDx); - } + outputs_of_red.push_back(tv1_cast); } + auto reduction_params = + cuda::getReductionHeuristics(&fusion, {input}, tv1); + TORCH_CHECK(reduction_params.has_value(), "Reduction is not found!"); + cuda::scheduleReduction( + &fusion, reduction_params.value(), tv1, outputs_of_red); + torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto cg_output = fe.runFusion({input}); + auto outputs = + fe.runFusion({input}, reduction_params.value().lparams); auto aten_output = input.sum({axis}); TORCH_CHECK( - aten_output.allclose(cg_output[0]), + aten_output.allclose(outputs[0], 1e-03, 1e-03), "Error of: ", - aten_output.sub(cg_output[0]).abs().max()); + aten_output.sub(outputs[0]).abs().max()); } } } } } -void testGPU_FusionCacheBefore() { +TEST(NVFuserTest, FusionCacheBefore_CUDA) { // TVM Cache Write Fusion fusion; FusionGuard fg(&fusion); @@ -4902,7 +5386,7 @@ void testGPU_FusionCacheBefore() { aten_output.sub(outputs[0]).abs().sum()); } -void testGPU_FusionCacheAfter() { +TEST(NVFuserTest, FusionCacheAfter_CUDA) { // TVM Cache Read Fusion fusion; FusionGuard fg(&fusion); @@ -4943,7 +5427,7 @@ void testGPU_FusionCacheAfter() { aten_output.sub(outputs[0]).abs().sum()); } -void testGPU_FusionCacheIndirect() { +TEST(NVFuserTest, FusionCacheIndirect_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4992,7 +5476,7 @@ void testGPU_FusionCacheIndirect() { aten_output.sub(outputs[0]).abs().sum()); } -void testGPU_FusionCacheBcast() { +TEST(NVFuserTest, FusionCacheBcast_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5051,7 +5535,7 @@ void testGPU_FusionCacheBcast() { aten_output.sub(outputs[0]).abs().max()); } -void testGPU_FusionCacheComplex() { +TEST(NVFuserTest, FusionCacheComplex_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5106,7 +5590,7 @@ void testGPU_FusionCacheComplex() { aten_output.sub(outputs[0]).abs().sum()); } -void testGPU_FusionCacheMultiConsumer() { +TEST(NVFuserTest, FusionCacheMultiConsumer_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5151,7 +5635,7 @@ void testGPU_FusionCacheMultiConsumer() { aten_output.sub(outputs[1]).abs().sum()); } -void testGPU_FusionSmem() { +TEST(NVFuserTest, FusionSmem_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5203,16 +5687,269 @@ void testGPU_FusionSmem() { aten_output.allclose(outputs[0], 1e-5, 1e-5), "Error of: ", aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 0); +} + 
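+// Reduces a 3D tensor over its middle dimension while staging the input +// through a shared-memory cache; the checks at the end expect exactly one +// WAR-hazard sync in the generated kernel.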
+TEST(NVFuserTest, FusionSmemReduce_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + // Algorithm + TensorView* tv0 = makeDummyTensor(3); // M, K, N + TensorView* tv1 = sum(tv0, {1}); // M, R, N + fusion.addInput(tv0); + fusion.addOutput(tv1); + + TensorView* tv2 = tv0->cache_after(); + tv2->setMemoryType(MemoryType::Shared); + + // Schedule + constexpr int BSX = 32; + tv1->split(2, BSX); + tv1->split(1, 128); + tv1->split(0, BSX); + // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX + tv1->reorder({{0, 0}, {1, 2}, {2, 4}, {3, 5}, {4, 1}, {5, 3}}); + TensorView* tv3 = tv1->rFactor({-2}); + + tv0->computeAt(tv1, -2); + tv0->computeAt(tv3, -2); + + // Thread and Block binding + tv1->axis(0)->parallelize(ParallelType::BIDx); + tv1->axis(1)->parallelize(ParallelType::BIDy); + tv1->axis(-1)->parallelize(ParallelType::TIDx); + // Manual Binding + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv3->axis(-1)->parallelize(ParallelType::TIDx); + + constexpr int M = 154, K = 45, N = 1524; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({M, K, N}, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({t0}); + + at::Tensor aten_output = sum(t0, {1}); + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 1); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.count(24) == 1); +} + +TEST(NVFuserTest, FusionSmemBlockGemm_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + // Algorithm + TensorView* tv0 = makeDummyTensor(2); // (M, K) + TensorView* tv1 = makeDummyTensor(2); // (K, N) + TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) + TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) + TensorView* tv4 = mul(tv2, tv3); // M, K, N + TensorView* tv5 = sum(tv4, {1}); // M, R, N + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addOutput(tv5); + + // Schedule + constexpr int BSX = 16; + tv5->split(2, BSX); + tv5->split(1, BSX); + tv5->split(0, BSX); + // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX + tv5->reorder({{0, 0}, {1, 3}, {2, 2}, {3, 5}, {4, 1}, {5, 4}}); + // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX + TensorView* tv6 = tv5->rFactor({-1}); + + tv2->setMemoryType(MemoryType::Shared); + tv3->setMemoryType(MemoryType::Shared); + tv4->setMemoryType(MemoryType::Shared); + tv6->setMemoryType(MemoryType::Shared); + + tv0->computeAt(tv5, 3); + tv1->computeAt(tv5, 3); + + // Thread and Block binding + tv5->axis(0)->parallelize(ParallelType::BIDx); + tv5->axis(1)->parallelize(ParallelType::BIDy); + tv5->axis(-2)->parallelize(ParallelType::TIDy); + tv5->axis(-1)->parallelize(ParallelType::TIDx); + // Manual Binding + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv3->axis(-1)->parallelize(ParallelType::TIDx); + tv4->axis(-1)->parallelize(ParallelType::TIDx); + tv6->axis(-3)->parallelize(ParallelType::TIDy); + tv6->axis(-2)->parallelize(ParallelType::TIDx); + + constexpr int M = 154, K = 45, N = 1524; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({M, K}, options); + at::Tensor t1 = at::randn({K, N}, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({t0, t1}); + + at::Tensor aten_output = matmul(t0, t1); + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + 
aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 0); } -void testGPU_FusionSmemReduce() { +TEST(NVFuserTest, FusionSmemBlockGemmCache_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Algorithm + TensorView* tv0 = makeDummyTensor(2); // (M, K) + TensorView* tv1 = makeDummyTensor(2); // (K, N) + TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) + TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) + TensorView* tv4 = mul(tv2, tv3); // M, K, N + TensorView* tv5 = sum(tv4, {1}); // M, R, N + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addOutput(tv5); + + // Schedule + // Remove reduction axis from tv5 + // tv6 = (M, R, N) + // tv5 = (M, N) + TensorView* tv6 = tv5->cache_before(); + + constexpr int BSX = 16; + tv5->split(1, BSX); + tv5->split(0, BSX); + // M/BSX, BSX, N/BSX, BSX + tv5->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}}); + // tv5 = M/BSX, N/BSX, MSX, NSX + + tv6->computeAt(tv5, 2); + tv6->computeAt(tv5, 2); + + tv6->split(-1, BSX); + // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX + tv6->reorder({{0, 0}, {1, 1}, {2, 3}, {3, 4}, {4, 2}, {5, 5}}); + // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX + TensorView* tv7 = tv6->rFactor({-1}); + // tv7 = M/BSX, N/BSX, K/BSXrf, MSX, NSX, KSXr + // tv6 = M/BSX, N/BSX, K/BSXr, MSX, NSX + + tv0->computeAt(tv6, 3); + tv1->computeAt(tv6, 3); + + tv0->computeAt(tv7, 3); + tv1->computeAt(tv7, 3); + + tv2->setMemoryType(MemoryType::Shared); + tv3->setMemoryType(MemoryType::Shared); + tv4->setMemoryType(MemoryType::Shared); + tv6->setMemoryType(MemoryType::Shared); + tv7->setMemoryType(MemoryType::Shared); + // Memory Type + + // Thread and Block binding + tv5->axis(0)->parallelize(ParallelType::BIDx); + tv5->axis(1)->parallelize(ParallelType::BIDy); + tv5->axis(-2)->parallelize(ParallelType::TIDy); + tv5->axis(-1)->parallelize(ParallelType::TIDx); + // Manual Binding + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv3->axis(-1)->parallelize(ParallelType::TIDx); + tv4->axis(-1)->parallelize(ParallelType::TIDx); + + tv7->axis(-3)->parallelize(ParallelType::TIDy); + tv7->axis(-2)->parallelize(ParallelType::TIDx); + + tv6->axis(-2)->parallelize(ParallelType::TIDy); + tv6->axis(-1)->parallelize(ParallelType::TIDx); + + constexpr int M = 154, K = 45, N = 1524; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({M, K}, options); + at::Tensor t1 = at::randn({K, N}, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({t0, t1}); + + at::Tensor aten_output = matmul(t0, t1); + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 0); +} + +TEST(NVFuserTest, FusionSmemDynamicReductionSymbolic_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Float(0), tv0); + fusion.addInput(tv0); + fusion.addOutput(tv1); + // tv1[I0, R1] = tv0[I0, I1] + + // Interface should just be a direct split with a Parallel type. We can + // include the parallelize call if we do this. 
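+ // Split the reduction axis by the runtime value of blockDim.x so the inner + // portion of the reduction can be bound to TIDx below.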
+ tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); + // tv1[I0, R1o, R1i{BIDx}] = tv0[I0, I1] + + TensorView* tv2 = tv1->rFactor({2}); + tv2->setMemoryType(MemoryType::Shared); + // tv2[I0, R1oo, Ir1i{BIDx}] = tv0[I0, I1] + // tv1[I0, R1i{BIDx}] = tv2[I0, R1oo, Ir1i{BIDx}] + + tv0->computeAt(tv1, 1); + + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv1->axis(0)->parallelize(ParallelType::BIDx); + + constexpr int numel_x = 65000, numel_y = 1024; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input = at::rand({numel_x, numel_y}, options); + + // How many threads to use for the block reduction + constexpr int runtime_threadIdx_dim = 128; + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion( + {input}, + torch::jit::fuser::cuda::LaunchParams( + -1, -1, -1, runtime_threadIdx_dim, -1, -1)); + + auto aten_output = input.sum({1}); + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 0); +} + +TEST(NVFuserTest, FusionSmemDynamicReductionSymbolicArg_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + // Algorithm + Int* sym_bsx = new Int(); TensorView* tv0 = makeDummyTensor(3); // M, K, N + fusion.addInput(tv0); + fusion.addInput(sym_bsx); + TensorView* tv1 = sum(tv0, {1}); // M, R, N - fusion.addInput(tv0); fusion.addOutput(tv1); TensorView* tv2 = tv0->cache_after(); @@ -5221,7 +5958,7 @@ void testGPU_FusionSmemReduce() { // Schedule constexpr int BSX = 32; tv1->split(2, BSX); - tv1->split(1, 128); + tv1->split(1, sym_bsx); tv1->split(0, BSX); // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX tv1->reorder({{0, 0}, {1, 2}, {2, 4}, {3, 5}, {4, 1}, {5, 3}}); @@ -5243,63 +5980,64 @@ void testGPU_FusionSmemReduce() { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({M, K, N}, options); + // How many threads to use for the block reduction + constexpr int runtime_threadIdx_dim = 128; + torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0}); + auto outputs = fe.runFusion( + {t0, runtime_threadIdx_dim}, + torch::jit::fuser::cuda::LaunchParams( + -1, -1, -1, runtime_threadIdx_dim, -1, -1)); at::Tensor aten_output = sum(t0, {1}); TORCH_CHECK( aten_output.allclose(outputs[0], 1e-5, 1e-5), "Error of: ", aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 1); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.count(24) == 1); } -void testGPU_FusionSmemBlockGemm() { +TEST(NVFuserTest, FusionSmemDynamicPwiseMulSymbolicArgWAR_CUDA) { Fusion fusion; FusionGuard fg(&fusion); - // Algorithm + Int* sym_bsx = new Int(); TensorView* tv0 = makeDummyTensor(2); // (M, K) TensorView* tv1 = makeDummyTensor(2); // (K, N) TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) TensorView* tv4 = mul(tv2, tv3); // M, K, N - TensorView* tv5 = sum(tv4, {1}); // M, R, N fusion.addInput(tv0); fusion.addInput(tv1); - fusion.addOutput(tv5); - - // Schedule - constexpr int BSX = 16; - tv5->split(2, BSX); - tv5->split(1, BSX); - tv5->split(0, BSX); - // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX - tv5->reorder({{0, 0}, {1, 3}, {2, 2}, {3, 5}, {4, 1}, {5, 4}}); - // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX - TensorView* tv6 = tv5->rFactor({-1}); + fusion.addInput(sym_bsx); 
+ fusion.addOutput(tv4); + // Algorithm tv2->setMemoryType(MemoryType::Shared); tv3->setMemoryType(MemoryType::Shared); - tv4->setMemoryType(MemoryType::Shared); - tv6->setMemoryType(MemoryType::Shared); - tv0->computeAt(tv5, 3); - tv1->computeAt(tv5, 3); + constexpr int BSX = 32; + tv4->split(2, BSX); + tv4->split(1, sym_bsx); + tv4->split(0, BSX); + // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX + tv4->reorder({{0, 0}, {1, 3}, {2, 1}, {3, 4}, {4, 2}, {5, 5}}); + // M/BSX, K/BSX, N/BSX, MSX, KSX, NSX - // Thread and Block binding - tv5->axis(0)->parallelize(ParallelType::BIDx); - tv5->axis(1)->parallelize(ParallelType::BIDy); - tv5->axis(-2)->parallelize(ParallelType::TIDy); - tv5->axis(-1)->parallelize(ParallelType::TIDx); + tv0->computeAt(tv4, 3); + tv1->computeAt(tv4, 3); + // Schedule + + tv4->axis(0)->parallelize(ParallelType::BIDx); + tv4->axis(2)->parallelize(ParallelType::BIDy); // Manual Binding - tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv2->axis(-2)->parallelize(ParallelType::TIDx); tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv6->axis(-3)->parallelize(ParallelType::TIDy); - tv6->axis(-2)->parallelize(ParallelType::TIDx); + // Thread and Block binding - constexpr int M = 154, K = 45, N = 1524; + constexpr int M = 128, K = 457, N = 1024; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({M, K}, options); @@ -5307,103 +6045,234 @@ void testGPU_FusionSmemBlockGemm() { torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0, t1}); + auto outputs = fe.runFusion( + {t0, t1, BSX}, + torch::jit::fuser::cuda::LaunchParams(-1, -1, -1, BSX, -1, -1)); - at::Tensor aten_output = matmul(t0, t1); + at::Tensor aten_output = mul(t0.unsqueeze(2), t1.unsqueeze(0)); TORCH_CHECK( aten_output.allclose(outputs[0], 1e-5, 1e-5), "Error of: ", aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 1); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.count(22) == 1); } -void testGPU_FusionSmemBlockGemmCache() { -#if 0 +TEST(NVFuserTest, FusionSmemDynamicTiledGemm_CUDA) { Fusion fusion; FusionGuard fg(&fusion); - // Algorithm - TensorView* tv0 = makeDummyTensor(2); // (M, K) - TensorView* tv1 = makeDummyTensor(2); // (K, N) - TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) - TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) - TensorView* tv4 = mul(tv2, tv3); // M, K, N - TensorView* tv5 = sum(tv4, {1}); // M, R, N + // Symbolic integers we will use for runtime tiling + Int* symbolic_m_tile_dim = new Int(); // bound to threadIdx.z + Int* symbolic_split_k_tile_dim = new Int(); // bound to blockIdx.x + Int* symbolic_block_k_tile_dim = new Int(); // bound to threadIdx.x + // Compile-time integer for tiling + int n_smem_tile = 8; // bound to threadIdx.y + + // Symbolic 2D tensors TV0[M, K], TV1[K, N] + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = makeDummyTensor(2); + + // Broadcast tv0 to [M, K, *] + TensorView* tv2 = broadcast(tv0, {false, false, true}); + // Broadcast tv1 to [*, K, N] + TensorView* tv3 = broadcast(tv1, {true, false, false}); + + // Pointwise multiplication resulting in tv3[M, K, N] + TensorView* tv4 = mul(tv2, tv3); + + // Turn the K-dimension of tv4 into a reduction dimension + TensorView* tv5 = sum(tv4, {1}); + + // Register inputs and outputs fusion.addInput(tv0); fusion.addInput(tv1); fusion.addOutput(tv5); - // Schedule - // 
Remove reduction axis from tv5 - // tv6 = (M, R, N) - // tv5 = (M, N) - TensorView* tv6 = tv5->cache_before(); + // Register runtime tile dims as inputs + fusion.addInput(symbolic_m_tile_dim); + fusion.addInput(symbolic_split_k_tile_dim); + fusion.addInput(symbolic_block_k_tile_dim); - constexpr int BSX = 16; - tv5->split(1, BSX); - tv5->split(0, BSX); - // M/BSX, BSX, N/BSX, BSX - tv5->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}}); - // tv5 = M/BSX, N/BSX, MSX, NSX + // Make a 3D tile, mix of symbolic and constant, do in reverse order because + // dims are inserted + tv5->split(2, n_smem_tile); + tv5->split(1, symbolic_block_k_tile_dim); + tv5->split(1, symbolic_split_k_tile_dim); + tv5->split(0, symbolic_m_tile_dim); - tv6->computeAt(tv5, 2); + // Reorder so all outer tiles are in the leftmost 3 positions + tv5->reorder({{1, 5}, {5, 1}}); + + // Factor out the outer reduction IterDomain, then run the inter-cta + // reduction, and intra-cta reduction + auto tv6 = tv5->rFactor({2}); + + // Scope computations tv6->computeAt(tv5, 2); - tv6->split(-1, BSX); - // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX - tv6->reorder({{0, 0}, {1, 1}, {2, 3}, {3, 4}, {4, 2}, {5, 5}}); - // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX - TensorView* tv7 = tv6->rFactor({-1}); - // tv7 = M/BSX, N/BSX, K/BSXrf, MSX, NSX, KSXr - // tv6 = M/BSX, N/BSX, K/BSXr, MSX, NSX + // RFactor moves reduction axes around, reorder to match ordering of tv5 + tv6->reorder({ + {2, -2}, + {3, -1}, + {4, 2}, + {5, 3}, + {6, 4}, + }); + // Setup compute at schedule tv0->computeAt(tv6, 3); tv1->computeAt(tv6, 3); + tv4->computeAt(tv6, -1); + // + // T2[Mo, bNo, Koo, Koi, Kii, Mi, bNi] CA(4, 3) + // T3[bMo, No, Koo, Koi, Kii, bMi, Ni] CA(4, 3) + // T4[ Mo, No, Koo, Koi, Kii, Mi, Ni] + // T6[ Mo, No, rKoo, Koi, Kii, Mi, Ni] + // T5[ Mo, No, rKoi, rKii, Mi, Ni] - tv0->computeAt(tv7, 3); - tv1->computeAt(tv7, 3); - + // Cache smem tiles tv2->setMemoryType(MemoryType::Shared); tv3->setMemoryType(MemoryType::Shared); - tv4->setMemoryType(MemoryType::Shared); - tv6->setMemoryType(MemoryType::Shared); - tv7->setMemoryType(MemoryType::Shared); - // Memory Type + tv4->setMemoryType(MemoryType::Local); + tv6->setMemoryType(MemoryType::Local); - // Thread and Block binding - tv5->axis(0)->parallelize(ParallelType::BIDx); + tv5->axis(0)->parallelize(ParallelType::BIDz); tv5->axis(1)->parallelize(ParallelType::BIDy); - tv5->axis(-2)->parallelize(ParallelType::TIDy); - tv5->axis(-1)->parallelize(ParallelType::TIDx); - // Manual Binding - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv7->axis(-3)->parallelize(ParallelType::TIDy); - tv7->axis(-2)->parallelize(ParallelType::TIDx); + std::vector tv_list = {tv2, tv3, tv4, tv5, tv6}; + for (auto tv : tv_list) { + tv->axis(-2)->parallelize(ParallelType::TIDz); + tv->axis(-1)->parallelize(ParallelType::TIDy); + } + tv2->axis(3)->parallelize(ParallelType::TIDx); + tv3->axis(3)->parallelize(ParallelType::TIDx); + tv4->axis(3)->parallelize(ParallelType::TIDx); + tv6->axis(3)->parallelize(ParallelType::TIDx); + tv5->axis(2)->parallelize(ParallelType::TIDx); - tv6->axis(-2)->parallelize(ParallelType::TIDy); - tv6->axis(-1)->parallelize(ParallelType::TIDx); + tv2->axis(4)->parallelize(ParallelType::BIDx); + tv3->axis(4)->parallelize(ParallelType::BIDx); + tv4->axis(4)->parallelize(ParallelType::BIDx); + tv6->axis(4)->parallelize(ParallelType::BIDx); + tv5->axis(3)->parallelize(ParallelType::BIDx); - constexpr int M = 154, K = 45, N 
= 1524; + constexpr int M = 31, K = 65, N = 33; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({M, K}, options); - at::Tensor t1 = at::randn({K, N}, options); + at::Tensor A = at::randn({M, K}, options); + at::Tensor B = at::randn({K, N}, options); torch::jit::fuser::cuda::FusionExecutor fe; + // Generate CUDA and compile with nvRTC fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0, t1}); - at::Tensor aten_output = matmul(t0, t1); + // Runtime tiling + int m_tile = 4; // bound to threadIdx.z + int split_k = 7; // bound to blockIdx.x + int intra_cta = 8; // bound to threadIdx.x + + auto fuser_outputs = fe.runFusion({A, B, m_tile, split_k, intra_cta}); + auto C_fuser = fuser_outputs[0]; + + at::Tensor aten_C = mul(A.unsqueeze(2), B.unsqueeze(0)).sum(1); + TORCH_CHECK( + aten_C.allclose(C_fuser, 1e-5, 1e-5), + "Error of: ", + aten_C.sub(C_fuser).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 1); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.count(41) == 1); +} + +TEST(NVFuserTest, FusionGlobalIntermediate_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Float(0), tv0); + fusion.addInput(tv0); + fusion.addOutput(tv1); + // tv1[I0, R1] = tv0[I0, I1] + + // Interface should just be a direct split with a Parallel type. We can + // include the parallelize call if we do this. + tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); + // tv1[I0, R1o, R1i{BIDx}] = tv0[I0, I1] + + TensorView* tv2 = tv1->rFactor({2}); + tv2->setMemoryType(MemoryType::Global); + // tv2[I0, R1oo, Ir1i{BIDx}] = tv0[I0, I1] + // tv1[I0, R1i{BIDx}] = tv2[I0, R1oo, Ir1i{BIDx}] + + tv0->computeAt(tv1, 1); + + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv1->axis(0)->parallelize(ParallelType::BIDx); + + constexpr int numel_x = 65000, numel_y = 1024; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input = at::rand({numel_x, numel_y}, options); + + // How many threads to use for the block reduction + constexpr int runtime_threadIdx_dim = 128; + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion( + {input}, + torch::jit::fuser::cuda::LaunchParams( + -1, -1, -1, runtime_threadIdx_dim, -1, -1)); + + auto aten_output = input.sum({1}); TORCH_CHECK( aten_output.allclose(outputs[0], 1e-5, 1e-5), "Error of: ", aten_output.sub(outputs[0]).abs().max()); -#endif } -void testGPU_FusionConstCheck() { +TEST(NVFuserTest, FusionGlobalIntermediateDefaultSchedule_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = makeDummyTensor(2); + TensorView* tv2 = makeDummyTensor(2); + TensorView* tv3 = makeDummyTensor(2); + TensorView* tv4 = sub(tv2, tv3); + TensorView* tv5 = add(tv1, tv4); + TensorView* tv6 = sub(tv5, tv0); + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addInput(tv2); + fusion.addInput(tv3); + fusion.addOutput(tv6); + // t6 = ((t1 + (t2 - t3)) - t0) + + tv4->setMemoryType(MemoryType::Global); + tv5->setMemoryType(MemoryType::Global); + tv6->setMemoryType(MemoryType::Global); + + constexpr int M = 32, N = 810; + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor in0 = at::rand({M, N}, options); + at::Tensor in1 = at::rand({M, N}, options); + at::Tensor in2 = at::rand({M, N}, 
options); + at::Tensor in3 = at::rand({M, N}, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({in0, in1, in2, in3}); + + at::Tensor aten_output = (in1 + (in2 - in3)) - in0; + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + aten_output.sub(outputs[0]).abs().sum()); +} + +TEST(NVFuserTest, FusionConstCheck_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5420,7 +6289,7 @@ void testGPU_FusionConstCheck() { TORCH_CHECK(one_x4->isConstScalar()); } -void testGPU_FusionUnrollWithAlloc() { +TEST(NVFuserTest, FusionUnrollWithAlloc_CUDA) { const std::vector tensor_dims_in = {128, 128}; Fusion fusion; FusionGuard fg(&fusion); @@ -5468,7 +6337,7 @@ void testGPU_FusionUnrollWithAlloc() { } // Test isZeroInt -void testGPU_FusionIsZeroInt() { +TEST(NVFuserTest, FusionIsZeroInt_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5481,7 +6350,7 @@ void testGPU_FusionIsZeroInt() { } // Test isOneInt -void testGPU_FusionIsOneInt() { +TEST(NVFuserTest, FusionIsOneInt_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5496,7 +6365,7 @@ void testGPU_FusionIsOneInt() { // This is to verify no cycle of computeAt is created. A more complex // variation of this pattern appears in one of the Python tests // (test_random_topo). -void testGPU_FusionComputeAtNonterminatingOutput() { +TEST(NVFuserTest, FusionComputeAtNonterminatingOutput_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5560,7 +6429,7 @@ void testGPU_FusionComputeAtNonterminatingOutput() { return; } -void testGPU_FusionTraversalOrder1() { +TEST(NVFuserTest, FusionTraversalOrder1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5608,7 +6477,7 @@ void testGPU_FusionTraversalOrder1() { t4.sub(cg_output_tv4).abs().max()); } -void testGPU_FusionTraversalOrder2() { +TEST(NVFuserTest, FusionTraversalOrder2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5661,7 +6530,7 @@ void testGPU_FusionTraversalOrder2() { t5.sub(cg_output_tv5).abs().max()); } -void testGPU_FusionTraversalOrder3() { +TEST(NVFuserTest, FusionTraversalOrder3_CUDA) { for (int i = 0; i < 2; ++i) { Fusion fusion; FusionGuard fg(&fusion); @@ -5729,7 +6598,7 @@ void testGPU_FusionTraversalOrder3() { } } -void testGPU_FusionTraversalOrder4() { +TEST(NVFuserTest, FusionTraversalOrder4_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5793,7 +6662,7 @@ void testGPU_FusionTraversalOrder4() { t7.sub(cg_output_tv7).abs().max()); } -void testGPU_FusionTraversalOrder5() { +TEST(NVFuserTest, FusionTraversalOrder5_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5843,7 +6712,7 @@ void testGPU_FusionTraversalOrder5() { t5.sub(cg_output_tv5).abs().max()); } -void testGPU_FusionTraversalOrder6() { +TEST(NVFuserTest, FusionTraversalOrder6_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5885,7 +6754,7 @@ void testGPU_FusionTraversalOrder6() { t4.sub(cg_output_tv4).abs().max()); } -void testGPU_FusionTraversalOrder7() { +TEST(NVFuserTest, FusionTraversalOrder7_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5934,7 +6803,7 @@ void testGPU_FusionTraversalOrder7() { } // Test predication of grid reduction -void testGPU_FusionThreadPredicate() { +TEST(NVFuserTest, FusionThreadPredicate_CUDA) { const int gdimx = 4; const int bdimx = 128; @@ -5990,6 +6859,195 @@ void testGPU_FusionThreadPredicate() { TORCH_CHECK(aten_output_tv3.allclose(cg_output_tv3)); } +TEST(NVFuserTest, FusionLSTMCell_CUDA) { + const int hidden_features = 512; + const int batch_size = 64; + + Fusion fusion; + FusionGuard 
fg(&fusion); + + TensorView* tvs[16]; + for (size_t i = 0; i < 16; i++) { + tvs[i] = makeDummyTensor(2); + fusion.addInput(tvs[i]); + } + + auto ingate = unaryOp( + UnaryOpType::Sigmoid, add(add(add(tvs[0], tvs[1]), tvs[2]), tvs[3])); + + auto forgetgate = unaryOp( + UnaryOpType::Sigmoid, add(add(add(tvs[4], tvs[5]), tvs[6]), tvs[7])); + + auto cellgate = unaryOp( + UnaryOpType::Tanh, add(add(add(tvs[8], tvs[9]), tvs[10]), tvs[11])); + + auto outgate = unaryOp( + UnaryOpType::Sigmoid, add(add(add(tvs[12], tvs[13]), tvs[14]), tvs[15])); + + auto cx = makeContigTensor(2); + fusion.addInput(cx); + + auto cy = add(mul(forgetgate, cx), mul(ingate, cellgate)); + + auto hy = mul(outgate, unaryOp(UnaryOpType::Tanh, cy)); + + fusion.addOutput(cy); + fusion.addOutput(hy); + + std::vector inputs; + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor large_tensor0 = + at::randn({batch_size, hidden_features * 4}, options); + at::Tensor large_tensor1 = + at::randn({batch_size, hidden_features * 4}, options); + at::Tensor large_tensor2 = + at::randn({batch_size, hidden_features * 4}, options); + at::Tensor large_tensor3 = + at::randn({batch_size, hidden_features * 4}, options); + + auto chunked0 = large_tensor0.chunk(4, 1); + auto chunked1 = large_tensor1.chunk(4, 1); + auto chunked2 = large_tensor2.chunk(4, 1); + auto chunked3 = large_tensor3.chunk(4, 1); + + inputs.insert(inputs.end(), chunked0.begin(), chunked0.end()); + inputs.insert(inputs.end(), chunked1.begin(), chunked1.end()); + inputs.insert(inputs.end(), chunked2.begin(), chunked2.end()); + inputs.insert(inputs.end(), chunked3.begin(), chunked3.end()); + + auto at_ingate = + chunked0[0].add(chunked0[1]).add(chunked0[2]).add(chunked0[3]).sigmoid(); + auto at_forgetgate = + chunked1[0].add(chunked1[1]).add(chunked1[2]).add(chunked1[3]).sigmoid(); + auto at_cellgate = + chunked2[0].add(chunked2[1]).add(chunked2[2]).add(chunked2[3]).tanh(); + auto at_outgate = + chunked3[0].add(chunked3[1]).add(chunked3[2]).add(chunked3[3]).sigmoid(); + + auto at_cx = at::randn({batch_size, hidden_features}, options); + inputs.push_back(at_cx); + auto at_cy = at_forgetgate.mul(at_cx).add(at_ingate.mul(at_cellgate)); + auto at_hy = at_outgate.mul(at_cy.tanh()); + + fuser::cuda::scheduleFusion(&fusion, c10::ArrayRef(inputs)); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion(c10::ArrayRef(inputs)); + + TORCH_CHECK(at_cy.allclose(outputs[0], 1e-4, 1e-7)); + TORCH_CHECK(at_hy.allclose(outputs[1], 1e-4, 1e-7)); +} + +TEST(NVFuserTest, FusionComputeAtMultiBCast_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeDummyTensor(1); + fusion.addInput(tv0); + + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = broadcast(tv1, {true, false}); + TensorView* tv3 = broadcast(tv1, {false, true}); + TensorView* tv4 = add(tv2, tv3); + fusion.addOutput(tv4); + + // This is not supported and should throw an exception. 
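+  // tv1 is consumed by two broadcasts with different broadcast positions
+  // (tv2 = {true, false}, tv3 = {false, true}); asking for tv1 to be computed
+  // at tv3's innermost position presumably cannot also satisfy tv2, which is
+  // the unsupported case the ASSERT_ANY_THROW below checks for.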
+ ASSERT_ANY_THROW(tv1->computeAt(tv3, -1)); +} + +TEST(NVFuserTest, FusionReductionHalf_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeDummyTensor(3, DataType::Half); + fusion.addInput(tv0); + + auto tv1 = castOp(DataType::Float, tv0); + auto tv2 = add(tv1, new Float(1.0)); + auto tv3 = sum(tv2, {2}); + auto tv4 = castOp(DataType::Half, tv3); + + fusion.addOutput(tv4); + + const auto options = + at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); + at::Tensor input = at::randn({8, 8, 16}, options); + + auto reduction_tv = tv3; + + auto outputsOfReduction = DependencyCheck::getAllOutputsOf({reduction_tv}); + + // Grab only tensor views, though there shouldn't be any other type + auto tv_entries = ir_utils::filterByType(outputsOfReduction); + + std::vector tvOutputsOfReduction( + tv_entries.begin(), tv_entries.end()); + + auto reduction_params = + cuda::getReductionHeuristics(&fusion, {input}, reduction_tv); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + cuda::scheduleReduction( + &fusion, reduction_params.value(), reduction_tv, tvOutputsOfReduction); + + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + + cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + // no broadcasting needed, omitting the last optional argument; + auto outputs = fe.runFusion({input}, reduction_params.value().lparams); + + auto aten_output = input.to(c10::ScalarType::Float) + .add(1.0) + .sum({2}) + .to(c10::ScalarType::Half); + + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-04, 1e-04), + "Error of: ", + aten_output.sub(outputs[0]).abs().max()); +} + +TEST(NVFuserTest, FusionInputsIdLookup_CUDA) { + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({16, 8, 8}, options); + at::Tensor t1 = at::randn({8, 8}, options); + at::Tensor t2 = at::randn({6, 4}, options); + + // create a cache with max size 2; + auto inputs_id_lookup = torch::jit::fuser::cuda::InputsIdLookup(2); + + // testing basic function, same encoding for identical inputs + auto id_0 = inputs_id_lookup.lookupId({t0, t1, 5.0}); + auto id_0_lookup = inputs_id_lookup.lookupId({t0, t1, 2.5}); + TORCH_CHECK(id_0.id == id_0_lookup.id); + TORCH_CHECK(inputs_id_lookup.size() == 1); + TORCH_CHECK(id_0.eviction == false); + + // new input (even tho same shape, but we have different signature because of + // missing scalar input + auto id_1 = inputs_id_lookup.lookupId({t0, t1}); + auto id_1_lookup = inputs_id_lookup.lookupId({t0, t1}); + TORCH_CHECK(id_1.id == id_1_lookup.id); + TORCH_CHECK(inputs_id_lookup.size() == 2); + TORCH_CHECK(id_1.eviction == false); + + // eviction should happen at this point + auto id_2 = inputs_id_lookup.lookupId({t2, t1}); + TORCH_CHECK(id_2.id != id_0.id); + TORCH_CHECK(id_2.id != id_1.id); + TORCH_CHECK(inputs_id_lookup.size() == 2); + TORCH_CHECK(id_2.eviction == true); + TORCH_CHECK(id_2.evict_id == id_0.id); + + // look at input 1 again + auto id_1_relook = inputs_id_lookup.lookupId({t0, t1}); + TORCH_CHECK(id_1_relook.id == id_1.id); + TORCH_CHECK(id_1_relook.eviction == false); +} + } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_graph_executor.cpp b/test/cpp/jit/test_graph_executor.cpp index 992cde217a90..923e3421738b 100644 --- a/test/cpp/jit/test_graph_executor.cpp +++ b/test/cpp/jit/test_graph_executor.cpp @@ -1,11 +1,12 @@ -#include "test/cpp/jit/test_base.h" +#include + #include "test/cpp/jit/test_utils.h" #include 
"torch/csrc/jit/runtime/graph_executor.h" namespace torch { namespace jit { -void testGraphExecutor() { +TEST(GraphExecutorTest, Basic_CUDA) { constexpr int batch_size = 4; constexpr int input_size = 256; diff --git a/test/cpp/jit/test_inliner.cpp b/test/cpp/jit/test_inliner.cpp index 2153a0389319..702f5bd97573 100644 --- a/test/cpp/jit/test_inliner.cpp +++ b/test/cpp/jit/test_inliner.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include @@ -36,18 +36,16 @@ struct InlinerGuard { bool oldState_; }; -void testInliner() { - { - // disable automatic inlining so we can test it manually - InlinerGuard guard(/*shouldInline=*/false); +TEST(InlinerTest, Basic) { + // disable automatic inlining so we can test it manually + InlinerGuard guard(/*shouldInline=*/false); - CompilationUnit cu(testSource); - auto& fn = cu.get_function("foo3"); + CompilationUnit cu(testSource); + auto& fn = cu.get_function("foo3"); - auto g = fn.graph(); - Inline(*g); - FileCheck().check_count("prim::Print", 3)->run(*g); - } + auto g = fn.graph(); + Inline(*g); + FileCheck().check_count("prim::Print", 3)->run(*g); } } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_interface.cpp b/test/cpp/jit/test_interface.cpp index b256e2328ceb..04a532459426 100644 --- a/test/cpp/jit/test_interface.cpp +++ b/test/cpp/jit/test_interface.cpp @@ -1,5 +1,5 @@ +#include -#include #include #include @@ -44,7 +44,7 @@ static void import_libs( si.loadType(QualifiedName(class_name)); } -void testModuleInterfaceSerialization() { +TEST(InterfaceTest, ModuleInterfaceSerialization) { auto cu = std::make_shared(); Module parentMod("parentMod", cu); Module subMod("subMod", cu); diff --git a/test/cpp/jit/test_interpreter.cpp b/test/cpp/jit/test_interpreter.cpp index 5977b0c0494a..da4607d7f047 100644 --- a/test/cpp/jit/test_interpreter.cpp +++ b/test/cpp/jit/test_interpreter.cpp @@ -1,12 +1,18 @@ -#include "test/cpp/jit/test_base.h" +#include + #include "test/cpp/jit/test_utils.h" -#include namespace torch { namespace jit { -void testTypeCheck() { - { +class TypeCheckTest : public ::testing::Test { + protected: + TypeCheckTest() : interp(makeInterp()) {} + + InterpreterState interp; + + private: + static InterpreterState makeInterp() { auto graph = std::make_shared(); std::unordered_map vmap; parseIR( @@ -20,88 +26,97 @@ graph(%a.1 : Tensor, vmap); Code function(graph, ""); - InterpreterState interp(function); - { - // TypeCheck yields to true! Shape, grad and device matches. 
- auto a = at::zeros({2, 2}, at::kFloat); - auto b = at::ones({3, 3}, at::kFloat); - a.set_requires_grad(true); - a = a.to(at::kCPU); - std::vector stack({a, b}); - interp.run(stack); - ASSERT_TRUE(exactlyEqual(stack[0].toTensor(), a)); - ASSERT_TRUE(exactlyEqual(stack[1].toTensor(), b)); - ASSERT_TRUE(stack[2].toBool()); - } - { - auto a = at::zeros({2, 2}, at::kFloat); - auto b = at::ones({2, 2}, at::kFloat); // Size mismatch - a.set_requires_grad(true); - a = a.to(at::kCPU); - std::vector stack({a, b}); - interp.run(stack); - ASSERT_FALSE(stack[2].toBool()); - } - { - auto a = at::zeros({2, 2}, at::kFloat); - auto b = at::ones({3, 3}, at::kFloat); - a = a.to(at::kCPU); - a.set_requires_grad(false); // Gradient mismatch - std::vector stack({a, b}); - interp.run(stack); - ASSERT_FALSE(stack[2].toBool()); - } - { - auto a = at::zeros({2, 2}, at::kFloat); - auto b = at::ones({3, 3}, at::kFloat); - a = a.to(at::kCPU); - a.set_requires_grad(true); - a = a.to(at::kInt); // Scalar type mismatch - std::vector stack({a, b}); - interp.run(stack); - ASSERT_FALSE(stack[2].toBool()); - } - { - auto a = at::zeros({2, 2}, at::kFloat); - auto b = at::ones({3, 3}, at::kFloat); - a.set_requires_grad(true); - a = a.to(at::kCUDA); // Device mismatch - std::vector stack({a, b}); - interp.run(stack); - ASSERT_FALSE(stack[2].toBool()); - } + return InterpreterState(function); } +}; - try { // Test empty Typecheck raises an internal assertion - auto graph = std::make_shared(); - std::unordered_map vmap; - parseIR( - R"IR( -graph(%a.1 : Tensor, - %b.1 : Tensor): - %type_matched : bool = prim::TypeCheck() - return (%type_matched) - )IR", - &*graph, - vmap); - } catch (const std::exception& e) { - } - try { // Test for assertion if num_inputs + 1 != num_outputs - auto graph = std::make_shared(); - std::unordered_map vmap; - parseIR( - R"IR( -graph(%a.1 : Tensor, - %b.1 : Tensor): - %type_matched : bool = prim::TypeCheck(%a.1) - return (%type_matched) - )IR", - &*graph, - vmap); - } catch (const std::exception& e) { - } +TEST_F(TypeCheckTest, MatchingType) { + // TypeCheck yields to true! Shape, grad and device matches. 
+ auto a = at::zeros({2, 2}, at::kFloat); + auto b = at::ones({3, 3}, at::kFloat); + a.set_requires_grad(true); + a = a.to(at::kCPU); + std::vector stack({a, b}); + interp.run(stack); + ASSERT_TRUE(exactlyEqual(stack[0].toTensor(), a)); + ASSERT_TRUE(exactlyEqual(stack[1].toTensor(), b)); + ASSERT_TRUE(stack[2].toBool()); +} + +TEST_F(TypeCheckTest, SizeMismatch) { + auto a = at::zeros({2, 2}, at::kFloat); + auto b = at::ones({2, 2}, at::kFloat); // Size mismatch + a.set_requires_grad(true); + a = a.to(at::kCPU); + std::vector stack({a, b}); + interp.run(stack); + ASSERT_FALSE(stack[2].toBool()); } -void testInterp() { + +TEST_F(TypeCheckTest, GradientMismatch) { + auto a = at::zeros({2, 2}, at::kFloat); + auto b = at::ones({3, 3}, at::kFloat); + a = a.to(at::kCPU); + a.set_requires_grad(false); // Gradient mismatch + std::vector stack({a, b}); + interp.run(stack); + ASSERT_FALSE(stack[2].toBool()); +} + +TEST_F(TypeCheckTest, ScalarTypeMismatch) { + auto a = at::zeros({2, 2}, at::kFloat); + auto b = at::ones({3, 3}, at::kFloat); + a = a.to(at::kCPU); + a.set_requires_grad(true); + a = a.to(at::kInt); // Scalar type mismatch + std::vector stack({a, b}); + interp.run(stack); + ASSERT_FALSE(stack[2].toBool()); +} + +TEST_F(TypeCheckTest, DeviceMismatch_CUDA) { + auto a = at::zeros({2, 2}, at::kFloat); + auto b = at::ones({3, 3}, at::kFloat); + a.set_requires_grad(true); + a = a.to(at::kCUDA); // Device mismatch + std::vector stack({a, b}); + interp.run(stack); + ASSERT_FALSE(stack[2].toBool()); +} + +// TODO: These tests weren't doing anything. +// TEST(TypeCheckErrorTest, EmptyCheckRaises) { +// // Test empty Typecheck raises an internal assertion +// auto graph = std::make_shared(); +// std::unordered_map vmap; +// EXPECT_ANY_THROW(parseIR( +// R"IR( +// graph(%a.1 : Tensor, +// %b.1 : Tensor): +// %type_matched : bool = prim::TypeCheck() +// return (%type_matched) +// )IR", +// &*graph, +// vmap)); +// } + +// TODO: These tests weren't doing anything. 
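+// (In their original form both cases wrapped parseIR in a try/catch with an
+// empty handler and no assertion, so they passed whether or not anything
+// threw; the commented-out versions here sketch the EXPECT_ANY_THROW form
+// instead.)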
+// TEST(TypeCheckErrorTest, WrongInputOutputCountRaises) { +// // Test for assertion if num_inputs + 1 != num_outputs +// auto graph = std::make_shared(); +// std::unordered_map vmap; +// EXPECT_ANY_THROW(parseIR( +// R"IR( +// graph(%a.1 : Tensor, +// %b.1 : Tensor): +// %type_matched : bool = prim::TypeCheck(%a.1) +// return (%type_matched) +// )IR", +// &*graph, +// vmap)); +// } + +TEST(InterpreterTest, Basic_CUDA) { constexpr int batch_size = 4; constexpr int input_size = 256; constexpr int seq_len = 32; diff --git a/test/cpp/jit/test_ir.cpp b/test/cpp/jit/test_ir.cpp index a05ff70061bf..2423bbf0c773 100644 --- a/test/cpp/jit/test_ir.cpp +++ b/test/cpp/jit/test_ir.cpp @@ -1,11 +1,12 @@ -#include "test/cpp/jit/test_base.h" +#include + #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/ir/irparser.h" namespace torch { namespace jit { -void testAttributes() { +TEST(IRTest, Attributes) { Graph g; auto one = attr::alpha; auto two = attr::device; @@ -33,7 +34,7 @@ void testAttributes() { ASSERT_EQ(attr2.f(one), 5); } -void testBlocks() { +TEST(IRTest, Blocks) { auto g = std::make_shared(); const auto graph_string = R"IR( graph(%a : Tensor, @@ -92,7 +93,7 @@ void testBlocks() { ->run(*g2); } -void testCommonAncestor() { +TEST(IRTest, CommonAncestor) { std::string input_str = R"( graph(%x : Tensor, %a.1 : bool, diff --git a/test/cpp/jit/test_irparser.cpp b/test/cpp/jit/test_irparser.cpp index a71b64a7b85b..57f21f5bf5f9 100644 --- a/test/cpp/jit/test_irparser.cpp +++ b/test/cpp/jit/test_irparser.cpp @@ -1,7 +1,8 @@ +#include + #include #include #include -#include "test/cpp/jit/test_base.h" #include #include @@ -38,52 +39,52 @@ static void checkRoundtrip(const std::string& s) { AT_ASSERT(original == parsed); } -void testIRParser() { - { - auto graph = std::make_shared(); - std::unordered_map vmap; - parseIR( - R"IR( +TEST(IRParserTest, Basic) { + auto graph = std::make_shared(); + std::unordered_map vmap; + parseIR( + R"IR( graph(%0 : Tensor, %1 : Tensor): %2 : Tensor = foo::add(%0, %1) %res, %3 = foo::mul(%0, %2) %x, %y = foo::combine(%res, %2, %3) return (%x, %y, %res))IR", - &*graph, - vmap); + &*graph, + vmap); - AT_ASSERT(graph->inputs().size() == 2); - AT_ASSERT(graph->outputs().size() == 3); - Value* x = graph->outputs()[0]; - Value* y = graph->outputs()[1]; - Value* res = graph->outputs()[2]; - Value* t0 = graph->inputs()[0]; - Value* t1 = graph->inputs()[1]; - AT_ASSERT(vmap["x"] == x); - AT_ASSERT(vmap["y"] == y); - AT_ASSERT(vmap["res"] == res); - AT_ASSERT(vmap["0"] == t0); - AT_ASSERT(vmap["1"] == t1); - AT_ASSERT(x->node() == y->node()); - Node* comb = x->node(); - Value* t2 = comb->inputs()[1]; - Value* t3 = comb->inputs()[2]; - AT_ASSERT(vmap["2"] == t2); - AT_ASSERT(vmap["3"] == t3); - AT_ASSERT(comb->kind().toQualString() == std::string("foo::combine")); - AT_ASSERT(comb->outputs() == std::vector({x, y})); - AT_ASSERT(comb->inputs() == std::vector({res, t2, t3})); - Node* mul = res->node(); - AT_ASSERT(mul->kind().toQualString() == std::string("foo::mul")); - AT_ASSERT(mul->inputs() == std::vector({t0, t2})); - AT_ASSERT(mul->outputs() == std::vector({res, t3})); - Node* add = t2->node(); - AT_ASSERT(add->kind().toQualString() == std::string("foo::add")); - AT_ASSERT(add->inputs() == std::vector({t0, t1})); - AT_ASSERT(add->outputs() == std::vector({t2})); - } - { - checkRoundtrip(R"IR( + AT_ASSERT(graph->inputs().size() == 2); + AT_ASSERT(graph->outputs().size() == 3); + Value* x = graph->outputs()[0]; + Value* y = graph->outputs()[1]; + Value* res = 
graph->outputs()[2]; + Value* t0 = graph->inputs()[0]; + Value* t1 = graph->inputs()[1]; + AT_ASSERT(vmap["x"] == x); + AT_ASSERT(vmap["y"] == y); + AT_ASSERT(vmap["res"] == res); + AT_ASSERT(vmap["0"] == t0); + AT_ASSERT(vmap["1"] == t1); + AT_ASSERT(x->node() == y->node()); + Node* comb = x->node(); + Value* t2 = comb->inputs()[1]; + Value* t3 = comb->inputs()[2]; + AT_ASSERT(vmap["2"] == t2); + AT_ASSERT(vmap["3"] == t3); + AT_ASSERT(comb->kind().toQualString() == std::string("foo::combine")); + AT_ASSERT(comb->outputs() == std::vector({x, y})); + AT_ASSERT(comb->inputs() == std::vector({res, t2, t3})); + Node* mul = res->node(); + AT_ASSERT(mul->kind().toQualString() == std::string("foo::mul")); + AT_ASSERT(mul->inputs() == std::vector({t0, t2})); + AT_ASSERT(mul->outputs() == std::vector({res, t3})); + Node* add = t2->node(); + AT_ASSERT(add->kind().toQualString() == std::string("foo::add")); + AT_ASSERT(add->inputs() == std::vector({t0, t1})); + AT_ASSERT(add->outputs() == std::vector({t2})); +} + +TEST(IRParserTest, NestedBlock) { + checkRoundtrip(R"IR( graph(): %0 : Tensor = a::a() block0(): @@ -95,9 +96,10 @@ graph(): %3 : Tensor = d::d() return (%3) )IR"); - } - { - checkRoundtrip(R"IR( +} + +TEST(IRParserTest, If) { + checkRoundtrip(R"IR( graph(%0 : Tensor, %1 : Tensor, %2 : Tensor): @@ -114,9 +116,10 @@ graph(%0 : Tensor, %11 : Tensor = aten::add(%5, %3, %10) return (%11) )IR"); - } - { - checkRoundtrip(R"IR( +} + +TEST(IRParserTest, If2) { + checkRoundtrip(R"IR( graph(%0 : Tensor, %1 : Tensor, %2 : Tensor): @@ -133,40 +136,43 @@ graph(%0 : Tensor, %11 : Tensor = aten::add(%5, %3, %10) return (%11) )IR"); - } - { - auto graph = std::make_shared(); - parseIR( - R"IR( +} + +TEST(IRParserTest, InferredTypeIsTensor) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(%a): return (%a))IR", - &*graph); - AT_ASSERT(graph->inputs()[0]->type()->isSubtypeOf(TensorType::get())); - } - { - // Check that parser correctly handles values reusing the same name. - auto graph = std::make_shared(); - parseIR( - R"IR( + &*graph); + AT_ASSERT(graph->inputs()[0]->type()->isSubtypeOf(TensorType::get())); +} + +TEST(IRParserTest, ValueReuse) { + // Check that parser correctly handles values reusing the same name. + auto graph = std::make_shared(); + parseIR( + R"IR( graph(%x): %x = a::a(%x) %x = b::b(%x) return (%x))IR", - &*graph); - Value* x0 = graph->inputs()[0]; - Value* x2 = graph->outputs()[0]; - Node* b = x2->node(); - Value* x1 = b->inputs()[0]; - Node* a = x1->node(); - AT_ASSERT(a->inputs() == std::vector({x0})); - AT_ASSERT(a->outputs() == std::vector({x1})); - AT_ASSERT(b->inputs() == std::vector({x1})); - AT_ASSERT(b->outputs() == std::vector({x2})); - } - { - // Check that parser handles attributes and types. - checkRoundtrip( - R"IR( + &*graph); + Value* x0 = graph->inputs()[0]; + Value* x2 = graph->outputs()[0]; + Node* b = x2->node(); + Value* x1 = b->inputs()[0]; + Node* a = x1->node(); + AT_ASSERT(a->inputs() == std::vector({x0})); + AT_ASSERT(a->outputs() == std::vector({x1})); + AT_ASSERT(b->inputs() == std::vector({x1})); + AT_ASSERT(b->outputs() == std::vector({x2})); +} + +TEST(IRParserTest, Attributes) { + // Check that parser handles attributes and types. + checkRoundtrip( + R"IR( graph(%0 : Tensor, %1 : Tensor, %2 : Tensor): @@ -176,155 +182,147 @@ graph(%0 : Tensor, %8 : string = z::z() return (%7) )IR"); - } +} - { - checkRoundtrip( - R"IR( +TEST(IRParserTest, OptionalTypes) { + checkRoundtrip( + R"IR( graph(%0 : Tensor, %1 : Tensor, %2 : Tensor): %3 : int? 
= prim::Constant() return (%3) )IR"); - } +} - { - checkRoundtrip( - R"IR( +TEST(IRParserTest, StarTensor) { + checkRoundtrip( + R"IR( graph(%0 : Tensor, %1 : Tensor, %2 : Tensor): %3 : Float(*, *, *) = prim::Constant() return (%3) )IR"); - } +} - { - checkRoundtrip( - R"IR( +TEST(IRParserTest, UnshapedTensor) { + checkRoundtrip( + R"IR( graph(%0 : Tensor, %1 : Tensor, %2 : Tensor): %3 : Long() = prim::Constant() return (%3) )IR"); - } +} - { - checkRoundtrip( - R"IR( +TEST(IRParserTest, ShapedTensor) { + checkRoundtrip( + R"IR( graph(%0 : Tensor, %1 : Tensor, %2 : Tensor): %3 : Double(4, 4, 5) = prim::Constant() return (%3) )IR"); - } +} - { - checkRoundtrip( - R"IR( +TEST(IRParserTest, NestedContrainer) { + checkRoundtrip( + R"IR( graph(): %0 : float[] = prim::Constant[value=[1., 2., 3.]]() %1 : str[] = prim::Constant[value=["ab", "cd", "ef"]]() %2 : (float[], str[]) = prim::TupleConstruct(%0, %1) return (%2) )IR"); - } +} - { - bool error_thrown = false; - try { - checkRoundtrip( - R"IR( +TEST(IRParserTest, MalformedShapeAnnotation) { + EXPECT_ANY_THROW(checkRoundtrip( + R"IR( graph(%0 : Tensor, %1 : Tensor, %2 : Tensor): %3 : Double(4!, 4, 5) = prim::Constant() return (%3) -)IR"); - } catch (const std::exception& error) { - error_thrown = true; - } - AT_ASSERT(error_thrown); - } +)IR")); +} - { - auto graph = std::make_shared(); - const std::string& text = - R"IR( +TEST(IRParserTest, FileCheck) { + auto graph = std::make_shared(); + const std::string& text = + R"IR( graph(%a): # CHECK: return return (%a))IR"; - parseIR(text, &*graph); - AT_ASSERT(graph->inputs()[0]->type()->isSubtypeOf(TensorType::get())); - torch::jit::testing::FileCheck().run(text, *graph); - } + parseIR(text, &*graph); + AT_ASSERT(graph->inputs()[0]->type()->isSubtypeOf(TensorType::get())); + torch::jit::testing::FileCheck().run(text, *graph); +} - { - auto graph = std::make_shared(); - std::unordered_map vmap; - parseIR( - R"IR( +TEST(IRParserTest, Strides) { + auto graph = std::make_shared(); + std::unordered_map vmap; + parseIR( + R"IR( graph(%a : Float(4, 5), %b : Float(4:5, 5:1), %c : Double(*, *)): return (%a) )IR", - &*graph, - vmap); - Value* a = graph->inputs()[0]; - Value* b = graph->inputs()[1]; - Value* c = graph->inputs()[2]; + &*graph, + vmap); + Value* a = graph->inputs()[0]; + Value* b = graph->inputs()[1]; + Value* c = graph->inputs()[2]; - auto a_type = a->type()->cast(); - auto a_sizes = *a_type->sizes().concrete_sizes(); - auto a_strides = a_type->strides().concrete_sizes(); - AT_ASSERT(a_sizes[0] == 4 && a_sizes[1] == 5); - AT_ASSERT(a_strides == c10::nullopt); + auto a_type = a->type()->cast(); + auto a_sizes = *a_type->sizes().concrete_sizes(); + auto a_strides = a_type->strides().concrete_sizes(); + AT_ASSERT(a_sizes[0] == 4 && a_sizes[1] == 5); + AT_ASSERT(a_strides == c10::nullopt); - auto b_type = b->type()->cast(); - auto b_sizes = *b_type->sizes().concrete_sizes(); - auto b_strides = *(b_type->strides().sizes()); - AT_ASSERT(b_sizes[0] == 4 && b_sizes[1] == 5); - AT_ASSERT(*b_strides[0] == 5 && *b_strides[1] == 1); + auto b_type = b->type()->cast(); + auto b_sizes = *b_type->sizes().concrete_sizes(); + auto b_strides = *(b_type->strides().sizes()); + AT_ASSERT(b_sizes[0] == 4 && b_sizes[1] == 5); + AT_ASSERT(*b_strides[0] == 5 && *b_strides[1] == 1); - auto c_type = c->type()->cast(); - AT_ASSERT(*c_type->sizes().size() == 2); - AT_ASSERT(c_type->sizes().concrete_sizes() == c10::nullopt); - AT_ASSERT(c_type->strides().concrete_sizes() == c10::nullopt); - } - { - auto graph = 
std::make_shared(); - std::unordered_map vmap; - bool error_thrown = false; - try { - parseIR( - R"IR( + auto c_type = c->type()->cast(); + AT_ASSERT(*c_type->sizes().size() == 2); + AT_ASSERT(c_type->sizes().concrete_sizes() == c10::nullopt); + AT_ASSERT(c_type->strides().concrete_sizes() == c10::nullopt); +} + +TEST(IRParserTest, MalformedStrides) { + auto graph = std::make_shared(); + std::unordered_map vmap; + bool error_thrown = false; + EXPECT_ANY_THROW(parseIR( + R"IR( graph(%a : Float(4:5, 5)): return (%a) )IR", - &*graph, - vmap); - } catch (const std::exception& error) { - error_thrown = true; - } - AT_ASSERT(error_thrown); - } - { - checkRoundtrip( - R"IR( + &*graph, + vmap)); +} + +TEST(IRParserTest, TensorShapes) { + checkRoundtrip( + R"IR( graph(%a : Float(4, 5), %b : Float(4:5, 5:1), %c : Double(*, *)): return (%a) )IR"); - } - { - checkRoundtrip( - R"IR( +} + +TEST(IRParserTest, DeviceAndRequiresGradTensors) { + checkRoundtrip( + R"IR( graph(%a : Float(*, *, device=cpu), %b : Float(*, *, requires_grad=1), %c : Long(5, 10, requires_grad=1, device=cpu), @@ -337,41 +335,45 @@ graph(%a : Float(*, *, device=cpu), %j : Double(*, *, requires_grad=0)): return (%a) )IR"); - } - { - auto graph = std::make_shared(); - parseIR( - R"IR( +} + +TEST(IRParserTest, ListConstant) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(): %d : int[] = prim::Constant[value=[1,2,3]]() return (%d) )IR", - &*graph); - Node* n = graph->outputs()[0]->node(); - AT_ASSERT(n->kind() == prim::Constant); - AT_ASSERT(n->kindOf(attr::value) == AttributeKind::ival); - const auto& genericList = n->ival(attr::value).toList(); - std::vector int_vals; - for (const IValue& ival : genericList) { - int_vals.push_back(ival.toInt()); - } - AT_ASSERT(int_vals.size() == 3); - AT_ASSERT(int_vals[0] == 1 && int_vals[1] == 2 && int_vals[2] == 3); + &*graph); + Node* n = graph->outputs()[0]->node(); + AT_ASSERT(n->kind() == prim::Constant); + AT_ASSERT(n->kindOf(attr::value) == AttributeKind::ival); + const auto& genericList = n->ival(attr::value).toList(); + std::vector int_vals; + for (const IValue& ival : genericList) { + int_vals.push_back(ival.toInt()); } - { - checkRoundtrip( - R"IR( + AT_ASSERT(int_vals.size() == 3); + AT_ASSERT(int_vals[0] == 1 && int_vals[1] == 2 && int_vals[2] == 3); +} + +TEST(IRParserTest, PartialStarTensor) { + checkRoundtrip( + R"IR( graph(%x : Float(10, *, 10)): return (%x) )IR"); - checkRoundtrip( - R"IR( +} + +TEST(IRParserTest, ComplexTensorAttributes) { + checkRoundtrip( + R"IR( graph(%x : Double(*, 200, *, requires_grad=1, device=cuda:1), %b : Float(5, *, requires_grad=1), %c : Long(*, 10, device=cpu)): return (%x) )IR"); - } } } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_jit_type.cpp b/test/cpp/jit/test_jit_type.cpp index 16c69ccd05fd..9462a572ea65 100644 --- a/test/cpp/jit/test_jit_type.cpp +++ b/test/cpp/jit/test_jit_type.cpp @@ -1,4 +1,5 @@ -#include +#include + #include #include #include "torch/csrc/jit/ir/ir.h" @@ -7,7 +8,7 @@ namespace torch { namespace jit { -void testUnifyTypes() { +TEST(JitTypeTest, UnifyTypes) { auto bool_tensor = TensorType::get()->withScalarType(at::kBool); auto opt_bool_tensor = OptionalType::create(bool_tensor); auto unified_opt_bool = unifyTypes(bool_tensor, opt_bool_tensor); diff --git a/test/cpp/jit/test_lite_interpreter.cpp b/test/cpp/jit/test_lite_interpreter.cpp index d09048413aec..b262075a42aa 100644 --- a/test/cpp/jit/test_lite_interpreter.cpp +++ b/test/cpp/jit/test_lite_interpreter.cpp @@ -1,5 +1,6 @@ +#include 
+ #include -#include #include #include #include @@ -10,11 +11,19 @@ #include +#define ASSERT_THROWS_WITH(statement, substring) \ + try { \ + (void)statement; \ + ASSERT_TRUE(false); \ + } catch (const std::exception& e) { \ + ASSERT_NE(std::string(e.what()).find(substring), std::string::npos); \ + } + // Tests go in torch::jit namespace torch { namespace jit { -void testLiteInterpreterUpsampleNearest2d() { +TEST(LiteInterpreterTest, UpsampleNearest2d) { Module m("m"); m.define(R"( def forward(self, input: Tensor, scale:float): @@ -37,7 +46,7 @@ void testLiteInterpreterUpsampleNearest2d() { ASSERT_TRUE(resd.equal(refd)); } -void testLiteInterpreterAdd() { +TEST(LiteInterpreterTest, Add) { Module m("m"); m.register_parameter("foo", torch::ones({}), false); // TODO: support default param val, which was pushed in @@ -71,7 +80,7 @@ void testLiteInterpreterAdd() { AT_ASSERT(resd == refd); } -void testLiteInterpreterConv() { +TEST(LiteInterpreterTest, Conv) { auto s = std::getenv("PYTORCH_TEST_WITH_TSAN"); if (s && strcmp(s, "1") == 0) return; @@ -103,7 +112,7 @@ void testLiteInterpreterConv() { outputref[0][0][0][0].item() == output[0][0][0][0].item()); } -void testLiteInterpreterInline() { +TEST(LiteInterpreterTest, Inline) { Module m("m"); m.define(R"JIT( def foo1(self, x): @@ -123,7 +132,7 @@ void testLiteInterpreterInline() { AT_ASSERT(output.toTensor().item() == 7.0); } -void testLiteInterpreterTuple() { +TEST(LiteInterpreterTest, Tuple) { Module m("m"); m.define(R"JIT( def foo(self, x): @@ -141,7 +150,7 @@ void testLiteInterpreterTuple() { AT_ASSERT(output.toTuple()->elements()[1].toInt() == 2); } -void testLiteInterpreterDict() { +TEST(LiteInterpreterTest, Dict) { Module m("m"); m.define(R"JIT( def foo(self, x): @@ -159,7 +168,7 @@ void testLiteInterpreterDict() { AT_ASSERT(output.toGenericDict().at("result").toTensor().item().toInt() == 2); } -void testLiteInterpreterPrimOverload() { +TEST(LiteInterpreterTest, PrimOverload) { /* // temporarily disabled script::Module m("m"); @@ -178,7 +187,7 @@ void testLiteInterpreterPrimOverload() { */ } -void testLiteInterpreterPrim() { +TEST(LiteInterpreterTest, Prim) { Module m("m"); m.define(R"JIT( def forward(self, x): @@ -204,7 +213,33 @@ void testLiteInterpreterPrim() { AT_ASSERT(resi == refi); } -void testLiteInterpreterLoadOrigJit() { +TEST(LiteInterpreterTest, PrimScalar) { + Module m("m"); + m.define(R"JIT( + def forward(self, x): + return int(x.item()) + )JIT"); + + std::vector inputs; + auto minput = 3.5 * torch::ones({}); + inputs.emplace_back(minput); + auto ref = m.run_method("forward", minput); + + std::stringstream ss; + m._save_for_mobile(ss); + mobile::Module bc = _load_for_mobile(ss); + IValue res; + for (int i = 0; i < 3; ++i) { + auto bcinputs = inputs; + res = bc.get_method("forward")(bcinputs); + } + + auto resi = res.toInt(); + auto refi = ref.toInt(); + AT_ASSERT(resi == refi); +} + +TEST(LiteInterpreterTest, LoadOrigJit) { Module m("m"); m.register_parameter("foo", torch::ones({}), false); m.define(R"( @@ -217,7 +252,7 @@ void testLiteInterpreterLoadOrigJit() { ASSERT_THROWS_WITH(_load_for_mobile(ss), "file not found"); } -void testLiteInterpreterWrongMethodName() { +TEST(LiteInterpreterTest, WrongMethodName) { Module m("m"); m.register_parameter("foo", torch::ones({}), false); m.define(R"( @@ -234,7 +269,7 @@ void testLiteInterpreterWrongMethodName() { ASSERT_THROWS_WITH(bc.get_method("forward")(inputs), "is not defined"); } -void testLiteInterpreterSetState() { +TEST(LiteInterpreterTest, SetState) { Module m("m"); 
m.register_parameter("foo", torch::ones({}), false); m.define(R"( @@ -282,7 +317,7 @@ class TorchBindLiteInterpreterTestStruct } }; -void testLiteInterpreterBuiltinFunction() { +TEST(LiteInterpreterTest, BuiltinFunction) { script::Module m("m"); auto custom_class_obj = make_custom_class(); @@ -302,7 +337,7 @@ void testLiteInterpreterBuiltinFunction() { AT_ASSERT(str == expected); } -void testLiteInterpreterModuleInfoBasic() { +TEST(LiteInterpreterTest, ModuleInfoBasic) { Module m("M"); m.define(R"JIT( def forward(self, x): @@ -331,7 +366,7 @@ void testLiteInterpreterModuleInfoBasic() { AT_ASSERT(module_debug_info_set == expected_result); } -void testLiteInterpreterNotSavingModuleInfo() { +TEST(LiteInterpreterTest, NotSaveModuleInfo) { Module m("M"); m.define(R"JIT( def forward(self, x): @@ -354,7 +389,7 @@ void testLiteInterpreterNotSavingModuleInfo() { } } -void testLiteInterpreterOneSubmoduleModuleInfo() { +TEST(LiteInterpreterTest, OneSubmoduleModuleInfo) { Module a("A"); a.define(R"JIT( def forward(self, x): @@ -390,7 +425,7 @@ void testLiteInterpreterOneSubmoduleModuleInfo() { AT_ASSERT(module_debug_info_set == expected_result); } -void testLiteInterpreterTwoSubmodulesModuleInfo() { +TEST(LiteInterpreterTest, TwoSubmodulesModuleInfo) { Module a("A"); a.define(R"JIT( def forward(self, x): @@ -432,7 +467,7 @@ void testLiteInterpreterTwoSubmodulesModuleInfo() { AT_ASSERT(module_debug_info_set == expected_result); } -void testLiteInterpreterSequentialModuleInfo() { +TEST(LiteInterpreterTest, SequentialModuleInfo) { Module a("A"); a.define(R"JIT( def forward(self, x): @@ -474,7 +509,7 @@ void testLiteInterpreterSequentialModuleInfo() { AT_ASSERT(module_debug_info_set == expected_result); } -void testLiteInterpreterHierarchyModuleInfo() { +TEST(LiteInterpreterTest, HierarchyModuleInfo) { Module a("A"); a.define(R"JIT( def forward(self, x): @@ -520,7 +555,7 @@ void testLiteInterpreterHierarchyModuleInfo() { AT_ASSERT(module_debug_info_set == expected_result); } -void testLiteInterpreterDuplicatedClassTypeModuleInfo() { +TEST(LiteInterpreterTest, DuplicatedClassTypeModuleInfo) { Module a("A"); a.define(R"JIT( def forward(self, x): @@ -560,7 +595,7 @@ void testLiteInterpreterDuplicatedClassTypeModuleInfo() { AT_ASSERT(module_debug_info_set == expected_result); } -void testLiteInterpreterEval() { +TEST(LiteInterpreterTest, Eval) { std::vector inputs; Module m("m"); @@ -593,7 +628,7 @@ void testLiteInterpreterEval() { outputref[0][0][0][0].item() == output[0][0][0][0].item()); } -void testLiteInterpreterFindWrongMethodName() { +TEST(LiteInterpreterTest, FindWrongMethodName) { Module m("m"); m.register_parameter("foo", torch::ones({}), false); m.define(R"( @@ -607,7 +642,7 @@ void testLiteInterpreterFindWrongMethodName() { ASSERT_TRUE(bc.find_method("forward") == c10::nullopt); } -void testLiteInterpreterFindAndRunMethod() { +TEST(LiteInterpreterTest, FindAndRunMethod) { Module m("m"); m.register_parameter("foo", torch::ones({}), false); m.define(R"( @@ -637,7 +672,7 @@ void testLiteInterpreterFindAndRunMethod() { AT_ASSERT(resd == refd); } -void testLiteInterpreterRunMethodVariadic() { +TEST(LiteInterpreterTest, RunMethodVariadic) { Module m("m"); m.register_parameter("foo", torch::ones({}), false); m.define(R"( diff --git a/test/cpp/jit/test_lite_trainer.cpp b/test/cpp/jit/test_lite_trainer.cpp index b70c4db62c70..9a988ecb2db1 100644 --- a/test/cpp/jit/test_lite_trainer.cpp +++ b/test/cpp/jit/test_lite_trainer.cpp @@ -1,5 +1,6 @@ +#include + #include -#include #include #include #include @@ 
-16,7 +17,7 @@ namespace torch { namespace jit { -void testLiteInterpreterParams() { +TEST(LiteTrainerTest, Params) { Module m("m"); m.register_parameter("foo", torch::ones({1}, at::requires_grad()), false); m.define(R"( @@ -74,7 +75,7 @@ void testLiteInterpreterParams() { AT_ASSERT(parameters[0].item() == bc_parameters[0].item()); } -void testMobileNamedParameters() { +TEST(MobileTest, NamedParameters) { Module m("m"); m.register_parameter("foo", torch::ones({}), false); m.define(R"( @@ -99,7 +100,7 @@ void testMobileNamedParameters() { } } -void testMobileSaveLoadData() { +TEST(MobileTest, SaveLoadData) { Module m("m"); m.register_parameter("foo", torch::ones({}), false); m.define(R"( @@ -127,7 +128,7 @@ void testMobileSaveLoadData() { } } -void testMobileSaveLoadParameters() { +TEST(MobileTest, SaveLoadParameters) { Module m("m"); m.register_parameter("foo", torch::ones({}), false); m.define(R"( @@ -157,7 +158,7 @@ void testMobileSaveLoadParameters() { } } -void testMobileSaveLoadParametersEmpty() { +TEST(MobileTest, SaveLoadParametersEmpty) { Module m("m"); m.define(R"( def add_it(self, x): @@ -180,7 +181,7 @@ void testMobileSaveLoadParametersEmpty() { AT_ASSERT(mobile_params.size() == 0); } -void testLiteSGD() { +TEST(LiteTrainerTest, SGD) { Module m("m"); m.register_parameter("foo", torch::ones({1}, at::requires_grad()), false); m.define(R"( @@ -253,7 +254,7 @@ struct DummyDataset : torch::data::datasets::Dataset { }; } // namespace -void testLiteSequentialSampler() { +TEST(LiteTrainerTest, SequentialSampler) { // test that sampler can be used with dataloader const int kBatchSize = 10; auto data_loader = diff --git a/test/cpp/jit/test_misc.cpp b/test/cpp/jit/test_misc.cpp index 4941c11d6cae..d205ae3d58db 100644 --- a/test/cpp/jit/test_misc.cpp +++ b/test/cpp/jit/test_misc.cpp @@ -1,9 +1,10 @@ +#include + #include #include #include #include -#include "test/cpp/jit/test_base.h" #include "test/cpp/jit/test_utils.h" #include @@ -92,7 +93,7 @@ std::ostream& operator<<(std::ostream& out, const std::vector& list) { return out; } -void testInternedStrings() { +TEST(InternedStringsTest, Basic) { ASSERT_EQ(prim::Param, Symbol::prim("Param")); ASSERT_EQ(prim::Return, Symbol::prim("Return")); ASSERT_EQ(prim::Return.toUnqualString(), std::string("Return")); @@ -108,7 +109,7 @@ void testInternedStrings() { ASSERT_EQ(Symbol(symstart + 2).toUnqualString(), std::string("What2")); } -void testFromQualString() { +TEST(FromQualStringTest, Basic) { ASSERT_EQ(Symbol::fromQualString("prim::Param"), Symbol::prim("Param")); ASSERT_EQ(Symbol::fromQualString("aten::mm"), Symbol::aten("mm")); ASSERT_EQ(Symbol::fromQualString("onnx::LSTM"), Symbol::onnx("LSTM")); @@ -138,7 +139,7 @@ void testFromQualString() { } } -void testTHNNConv() { +TEST(THNNConvTest, Basic) { std::vector input_size = {4, 3, 15, 17}; // B x C x H x W std::vector kernel_size = {3, 5}; std::vector stride = {1, 2}; @@ -233,7 +234,7 @@ void testTHNNConv() { assertAllClose(tensor_grads_out, expected_tensor_grads_out); } -void testATenNativeBatchNorm() { +TEST(ATenNativeBatchNormTest, Basic) { // aten::native_batch_norm(Tensor input, Tensor weight, Tensor bias, Tensor // running_mean, Tensor running_var, bool training, float momentum, float eps) // -> (Tensor, Tensor, Tensor) @@ -365,7 +366,7 @@ void testATenNativeBatchNorm() { assertAllClose(tensor_grads_out, expected_tensor_grads_out); } -void testCustomFusion() { +TEST(CustomFusionTest, Basic) { auto graph_string = R"IR( graph(%0 : Float(2, 3, 4), %1 : Float(2, 3, 4)): @@ -399,7 +400,7 @@ void 
testCustomFusion() { AT_ASSERT(hits == 2); } -void testCustomFusionNestedBlocks() { +TEST(CustomFusionTest, NestedBlocks) { auto graph_string = R"IR( graph(%0 : Float(2, 3, 4), %1 : Float(2, 3, 4), @@ -461,7 +462,8 @@ static const auto cf_examples = R"JIT( i += 1 return a )JIT"; -void testControlFlow() { + +TEST(ControlFlowTest, Basic) { auto cu = compile(cf_examples); auto run = [&](const std::string& name, std::vector stack) { @@ -484,13 +486,13 @@ void testControlFlow() { ASSERT_EQ(256, run_binary("while_test", 2, 0)); } -void testProto() { +TEST(ProtoTest, Basic) { ::ONNX_NAMESPACE::ModelProto proto; proto.set_producer_name("foo"); } // test a few features that are not directly used in schemas yet -void testSchemaParser() { +TEST(SchemaParserTest, NestedArrays) { // nested arrays auto s = parseSchema("at::what(int[][4] foo) -> ()"); ASSERT_TRUE(s.arguments().at(0).N() == 4); @@ -509,145 +511,151 @@ void testSchemaParser() { ->getElementType() ->expect() ->getElementType())); +} +TEST(SchemaParserTest, NamedReturns) { // named returns parseSchema("at::what(Tensor! i_will_be_written_to) -> ()"); auto s3 = parseSchema("at::what() -> (Tensor the_return, Tensor the_return2)"); ASSERT_TRUE(s3.returns().at(0).name() == "the_return"); ASSERT_TRUE(s3.returns().at(1).name() == "the_return2"); +} +TEST(SchemaParserTest, Futures) { // futures auto s4 = parseSchema("at::what(Future(int) foo) -> ()"); ASSERT_TRUE(IntType::get()->isSubtypeOf( s4.arguments().at(0).type()->expect()->getElementType())); +} +TEST(SchemaParserTest, AnnotatedAliasSets) { // test tensor with annotated alias sets parseSchema("at::what(Tensor(a) foo) -> (Tensor(a))"); +} - { - const auto s = parseSchema( - "at::what(Tensor(b|c)[](a!) list, Tensor(c) element)" - " -> (Tensor(b|c)[](a!))"); - - // The list itself is annotated with `a` - const auto& aliasInfo = *s.arguments().at(0).alias_info(); - ASSERT_TRUE( - aliasInfo.beforeSets() == - std::unordered_set{Symbol::fromQualString("alias::a")}); - ASSERT_TRUE(aliasInfo.isWrite()); - - // Check the contained types - ASSERT_TRUE(!aliasInfo.containedTypes().empty()); - const auto& containedAliasInfo = aliasInfo.containedTypes()[0]; - const auto expected = std::unordered_set{ - Symbol::fromQualString("alias::b"), - Symbol::fromQualString("alias::c"), - }; - ASSERT_TRUE(containedAliasInfo.beforeSets() == expected); - ASSERT_TRUE(containedAliasInfo.afterSets() == expected); - ASSERT_FALSE(containedAliasInfo.isWrite()); - } - { - const auto s = parseSchema( - "at::what(Tensor(b -> b|c)[](a!) 
list, Tensor(c) element)" - " -> (Tensor(b|c)[](a!))"); - - // The list itself is annotated with `a` - const auto& aliasInfo = *s.arguments().at(0).alias_info(); - ASSERT_EQ( - aliasInfo.beforeSets(), - std::unordered_set{Symbol::fromQualString("alias::a")}); - ASSERT_EQ( - aliasInfo.afterSets(), - std::unordered_set{Symbol::fromQualString("alias::a")}); - ASSERT_TRUE(aliasInfo.isWrite()); - ASSERT_EQ(aliasInfo.containedTypes().size(), 1); - - // Check the contained types - ASSERT_TRUE(!aliasInfo.containedTypes().empty()); - const auto& containedAliasInfo = aliasInfo.containedTypes()[0]; - const auto expectedBefore = std::unordered_set{ - Symbol::fromQualString("alias::b"), - }; - const auto expectedAfter = std::unordered_set{ - Symbol::fromQualString("alias::b"), Symbol::fromQualString("alias::c")}; - ASSERT_TRUE(containedAliasInfo.beforeSets() == expectedBefore); - ASSERT_TRUE(containedAliasInfo.afterSets() == expectedAfter); - ASSERT_FALSE(containedAliasInfo.isWrite()); - } +TEST(SchemaParserTest, BeforeAfterSets) { + const auto s = parseSchema( + "at::what(Tensor(b|c)[](a!) list, Tensor(c) element)" + " -> (Tensor(b|c)[](a!))"); + + // The list itself is annotated with `a` + const auto& aliasInfo = *s.arguments().at(0).alias_info(); + ASSERT_TRUE( + aliasInfo.beforeSets() == + std::unordered_set{Symbol::fromQualString("alias::a")}); + ASSERT_TRUE(aliasInfo.isWrite()); + + // Check the contained types + ASSERT_TRUE(!aliasInfo.containedTypes().empty()); + const auto& containedAliasInfo = aliasInfo.containedTypes()[0]; + const auto expected = std::unordered_set{ + Symbol::fromQualString("alias::b"), + Symbol::fromQualString("alias::c"), + }; + ASSERT_TRUE(containedAliasInfo.beforeSets() == expected); + ASSERT_TRUE(containedAliasInfo.afterSets() == expected); + ASSERT_FALSE(containedAliasInfo.isWrite()); } -void testTopologicalIndex() { - { - Graph graph; - auto node1 = graph.create(prim::AutogradZero); - auto node2 = graph.create(prim::AutogradZero); - auto node3 = graph.create(prim::AutogradZero); - auto node4 = graph.create(prim::AutogradZero); - - graph.appendNode(node4); - graph.prependNode(node1); - node2->insertAfter(node1); - node3->insertBefore(node4); - - // nodes should be in numerical order - ASSERT_TRUE(node1->isBefore(node2)); - ASSERT_TRUE(node1->isBefore(node3)); - ASSERT_TRUE(node1->isBefore(node4)); - ASSERT_TRUE(node2->isAfter(node1)); - ASSERT_TRUE(node2->isBefore(node3)); - ASSERT_TRUE(node2->isBefore(node4)); - ASSERT_FALSE(node3->isBefore(node1)); - ASSERT_FALSE(node3->isBefore(node2)); - ASSERT_FALSE(node3->isAfter(node4)); - - // Built up a block structure - // node3 - // /\ ... - // A B block1 - // \ ... - // C block2 - auto block1 = node3->addBlock(); - auto A = graph.create(prim::AutogradZero); - block1->appendNode(A); - auto B = graph.create(prim::AutogradZero); - block1->appendNode(B); - auto block2 = B->addBlock(); - auto C = graph.create(prim::AutogradZero); - block2->appendNode(C); - - // Check isAfter on different block levels - ASSERT_TRUE(node1->isBefore(A)); - ASSERT_TRUE(A->isBefore(B)); - ASSERT_TRUE(A->isBefore(C)); - - // make sure things don't blow up on deletions - node2->destroy(); - auto node2p = graph.create(prim::AutogradZero); - node2p->insertAfter(node1); - ASSERT_TRUE(node1->isBefore(node2p)); - ASSERT_TRUE(node2p->isBefore(node3)); +TEST(SchemaParserTest, BeforeAfterSets2) { + const auto s = parseSchema( + "at::what(Tensor(b -> b|c)[](a!) 
list, Tensor(c) element)" + " -> (Tensor(b|c)[](a!))"); + + // The list itself is annotated with `a` + const auto& aliasInfo = *s.arguments().at(0).alias_info(); + ASSERT_EQ( + aliasInfo.beforeSets(), + std::unordered_set{Symbol::fromQualString("alias::a")}); + ASSERT_EQ( + aliasInfo.afterSets(), + std::unordered_set{Symbol::fromQualString("alias::a")}); + ASSERT_TRUE(aliasInfo.isWrite()); + ASSERT_EQ(aliasInfo.containedTypes().size(), 1); + + // Check the contained types + ASSERT_TRUE(!aliasInfo.containedTypes().empty()); + const auto& containedAliasInfo = aliasInfo.containedTypes()[0]; + const auto expectedBefore = std::unordered_set{ + Symbol::fromQualString("alias::b"), + }; + const auto expectedAfter = std::unordered_set{ + Symbol::fromQualString("alias::b"), Symbol::fromQualString("alias::c")}; + ASSERT_TRUE(containedAliasInfo.beforeSets() == expectedBefore); + ASSERT_TRUE(containedAliasInfo.afterSets() == expectedAfter); + ASSERT_FALSE(containedAliasInfo.isWrite()); +} + +TEST(TopologicalIndexTest, Basic) { + Graph graph; + auto node1 = graph.create(prim::AutogradZero); + auto node2 = graph.create(prim::AutogradZero); + auto node3 = graph.create(prim::AutogradZero); + auto node4 = graph.create(prim::AutogradZero); + + graph.appendNode(node4); + graph.prependNode(node1); + node2->insertAfter(node1); + node3->insertBefore(node4); + + // nodes should be in numerical order + ASSERT_TRUE(node1->isBefore(node2)); + ASSERT_TRUE(node1->isBefore(node3)); + ASSERT_TRUE(node1->isBefore(node4)); + ASSERT_TRUE(node2->isAfter(node1)); + ASSERT_TRUE(node2->isBefore(node3)); + ASSERT_TRUE(node2->isBefore(node4)); + ASSERT_FALSE(node3->isBefore(node1)); + ASSERT_FALSE(node3->isBefore(node2)); + ASSERT_FALSE(node3->isAfter(node4)); + + // Built up a block structure + // node3 + // /\ ... + // A B block1 + // \ ... 
+ // C block2 + auto block1 = node3->addBlock(); + auto A = graph.create(prim::AutogradZero); + block1->appendNode(A); + auto B = graph.create(prim::AutogradZero); + block1->appendNode(B); + auto block2 = B->addBlock(); + auto C = graph.create(prim::AutogradZero); + block2->appendNode(C); + + // Check isAfter on different block levels + ASSERT_TRUE(node1->isBefore(A)); + ASSERT_TRUE(A->isBefore(B)); + ASSERT_TRUE(A->isBefore(C)); + + // make sure things don't blow up on deletions + node2->destroy(); + auto node2p = graph.create(prim::AutogradZero); + node2p->insertAfter(node1); + ASSERT_TRUE(node1->isBefore(node2p)); + ASSERT_TRUE(node2p->isBefore(node3)); +} + +TEST(TopologicalIndexTest, Reindex) { + // Induce reindexing to test that path + Graph graph; + std::map nodes; + + auto anchor = graph.create(prim::AutogradZero); + graph.appendNode(anchor); + // Inserting to the same place a lot will trigger reindexing + for (auto i = 0; i < 100; ++i) { + auto n = graph.create(prim::AutogradZero); + n->insertAfter(anchor); + nodes[i] = n; } - { - // Induce reindexing to test that path - Graph graph; - std::map nodes; - - auto anchor = graph.create(prim::AutogradZero); - graph.appendNode(anchor); - // Inserting to the same place a lot will trigger reindexing - for (auto i = 0; i < 100; ++i) { - auto n = graph.create(prim::AutogradZero); - n->insertAfter(anchor); - nodes[i] = n; - } - // Nodes should be in reverse order - for (auto i = 0; i < 100; ++i) { - for (auto j = i + 1; j < 100; ++j) { - ASSERT_TRUE(nodes[i]->isAfter(nodes[j])); - } + // Nodes should be in reverse order + for (auto i = 0; i < 100; ++i) { + for (auto j = i + 1; j < 100; ++j) { + ASSERT_TRUE(nodes[i]->isAfter(nodes[j])); } } } @@ -770,7 +778,7 @@ void checkScopeCallbacks() { TORCH_CHECK(found_user_scope); } -void testRecordFunction() { +TEST(RecordFunctionTest, Basic) { // disabling the inlining of method calls GraphOptimizerEnabledGuard opt_guard(false); @@ -817,7 +825,6 @@ void testRecordFunction() { traced_inputs.clear(); } - TORCH_CHECK(ts_names.size() == 2); TORCH_CHECK(ts_names.find("forward") != ts_names.end()); TORCH_CHECK(ts_names.find("foo") != ts_names.end()); @@ -1136,7 +1143,7 @@ void checkDebugInfo(c10::DebugInfoKind kind, int model_id) { TORCH_CHECK(test_debug_info->getModelId() == model_id); } -void testThreadLocalDebugInfo() { +TEST(ThreadLocalDebugInfoTest, Basic) { TORCH_CHECK( c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::TEST_INFO) == nullptr); auto debug_info = std::make_shared(); @@ -1209,7 +1216,7 @@ void testThreadLocalDebugInfo() { } } -void testFallbackGraphs() { +TEST(FallbackGraphsTest, Basic) { static const auto nestGraphIntoFallbackGraph = [](const std::shared_ptr& graph) { ProfilingRecord::removeProfileCounter(graph->block()); @@ -1285,35 +1292,36 @@ void testFallbackGraphs() { } } -void testAutogradProfiler() { - constexpr int batch_size = 4; - constexpr int input_size = 256; - constexpr int seq_len = 32; - - int hidden_size = 2 * input_size; - auto input = torch::randn({seq_len, batch_size, input_size}, at::kCPU); - auto hx = torch::randn({batch_size, hidden_size}, at::kCPU); - auto cx = torch::randn({batch_size, hidden_size}, at::kCPU); - auto w_ih = t_def(torch::randn({4 * hidden_size, input_size}, at::kCPU)); - auto w_hh = t_def(torch::randn({4 * hidden_size, hidden_size}, at::kCPU)); - - std::stringstream ss; - { - RecordProfile guard(ss); - for (size_t i = 0; i < 100; ++i) { - std::tie(hx, cx) = lstm(input[0], hx, cx, w_ih, w_hh); - } - } - - std::string result = ss.str(); - size_t 
count = 0; - for (size_t pos = 0; (pos = result.find("tanh", pos)) != std::string::npos; - count++, pos++) { - } - TORCH_CHECK(count == 200); -} - -void testNoneSchemaMatch() { +// TODO this test wasn't running and is broken. +// TEST(AutogradProfilerTest, Basic) { +// constexpr int batch_size = 4; +// constexpr int input_size = 256; +// constexpr int seq_len = 32; + +// int hidden_size = 2 * input_size; +// auto input = torch::randn({seq_len, batch_size, input_size}, at::kCPU); +// auto hx = torch::randn({batch_size, hidden_size}, at::kCPU); +// auto cx = torch::randn({batch_size, hidden_size}, at::kCPU); +// auto w_ih = t_def(torch::randn({4 * hidden_size, input_size}, at::kCPU)); +// auto w_hh = t_def(torch::randn({4 * hidden_size, hidden_size}, at::kCPU)); + +// std::stringstream ss; +// { +// RecordProfile guard(ss); +// for (size_t i = 0; i < 100; ++i) { +// std::tie(hx, cx) = lstm(input[0], hx, cx, w_ih, w_hh); +// } +// } + +// std::string result = ss.str(); +// size_t count = 0; +// for (size_t pos = 0; (pos = result.find("tanh", pos)) != std::string::npos; +// count++, pos++) { +// } +// ASSERT_EQ((count, 200); +// } + +TEST(NoneSchemaMatchTest, Basic) { RegisterOperators reg({ Operator( "prim::test_none() -> int?", @@ -1348,40 +1356,6 @@ void testNoneSchemaMatch() { AT_ASSERT(std::distance(nodes.begin(), nodes.end()) == 1); } -void testModuleDefine() { - Module m("m"); - m.register_parameter("foo", torch::ones({}), false); - m.define(R"( - def add_it(self, x, b : int = 4): - return self.foo + x + b - )"); - auto result = m.run_method("add_it", torch::ones({})); - AT_ASSERT(result.toTensor().item() == 6); -} - -void testModuleConversion() { - Module m("test"); - { - // test cuda to cpu for params and buffers - m.register_parameter("foo", torch::ones({}, at::kCUDA), false); - m.register_buffer("bar", torch::ones({}, at::kCUDA)); - - m.to(at::kCUDA); - m.to(at::kCPU); - AT_ASSERT(m.attr("foo").toTensor().device().is_cpu()); - AT_ASSERT(m.attr("bar").toTensor().device().is_cpu()); - } - { - // test cpu to cuda for params and buffers - m.register_parameter("foo", torch::ones({}), false); - m.register_buffer("bar", torch::ones({})); - - m.to(at::kCUDA); - AT_ASSERT(m.attr("foo").toTensor().device().is_cuda()); - AT_ASSERT(m.attr("bar").toTensor().device().is_cuda()); - } -} - static int testPassValue = 0; void fakePass(std::shared_ptr& g) { testPassValue++; @@ -1390,7 +1364,7 @@ void fakePass(std::shared_ptr& g) { RegisterPass p(fakePass); -void testPassManagement() { +TEST(PassManagementTest, Basic) { std::shared_ptr graph = std::make_shared(); parseIR( R"IR( @@ -1447,14 +1421,17 @@ size_t countNodes( return count; } -void testLoopPeeler() { - // peel all loops - auto true_pred = [](Node* n) { return true; }; - auto is_loop = [](Node* n) { return n->kind() == prim::Loop; }; +bool true_pred(Node* n) { + return true; +}; + +bool is_loop(Node* n) { + return n->kind() == prim::Loop; +}; +TEST(LoopPeelerTest, NoInductionVariableUse) { // do not use an induction variable explicitly - { - static const auto str_func_def = R"JIT( + static const auto str_func_def = R"JIT( def test_peel_n_times(): sum = 0 for i in range(10): @@ -1462,41 +1439,41 @@ void testLoopPeeler() { return sum )JIT"; - auto cu = compile(str_func_def); - auto& f = cu->get_function("test_peel_n_times"); - auto stack = createStack({}); - // peeling loop once - { - LoopsPeeler peeler(true_pred, 1); - auto copy = f.graph()->copy(); - peeler.run(copy); - int num_loops = - std::count_if(copy->nodes().begin(), 
copy->nodes().end(), is_loop); - ASSERT_EQ(num_loops, 2); - Code code(copy, ""); - InterpreterState interpreter{code}; - interpreter.run(stack); - ASSERT_EQ(stack.back().toInt(), 20); - } + auto cu = compile(str_func_def); + auto& f = cu->get_function("test_peel_n_times"); + auto stack = createStack({}); + // peeling loop once + { + LoopsPeeler peeler(true_pred, 1); + auto copy = f.graph()->copy(); + peeler.run(copy); + int num_loops = + std::count_if(copy->nodes().begin(), copy->nodes().end(), is_loop); + ASSERT_EQ(num_loops, 2); + Code code(copy, ""); + InterpreterState interpreter{code}; + interpreter.run(stack); + ASSERT_EQ(stack.back().toInt(), 20); + } - // test peeling more than one iteration - { - LoopsPeeler peeler(true_pred, 3); - auto copy = f.graph()->copy(); - peeler.run(copy); - int num_loops = - std::count_if(copy->nodes().begin(), copy->nodes().end(), is_loop); - ASSERT_EQ(num_loops, 2); - Code code(copy, ""); - InterpreterState interpreter{code}; - interpreter.run(stack); - ASSERT_EQ(stack.back().toInt(), 20); - } + // test peeling more than one iteration + { + LoopsPeeler peeler(true_pred, 3); + auto copy = f.graph()->copy(); + peeler.run(copy); + int num_loops = + std::count_if(copy->nodes().begin(), copy->nodes().end(), is_loop); + ASSERT_EQ(num_loops, 2); + Code code(copy, ""); + InterpreterState interpreter{code}; + interpreter.run(stack); + ASSERT_EQ(stack.back().toInt(), 20); } +} +TEST(LoopPeelerTest, YesInductionVariableUse) { // uses the induction variable - { - static const auto str_func_def = R"JIT( + static const auto str_func_def = R"JIT( def test_peel_n_times(): sum = 0 for i in range(10): @@ -1504,41 +1481,41 @@ void testLoopPeeler() { return sum )JIT"; - auto cu = compile(str_func_def); - auto& f = cu->get_function("test_peel_n_times"); - auto stack = createStack({}); - // peeling loop once - { - LoopsPeeler peeler(true_pred, 1); - auto copy = f.graph()->copy(); - peeler.run(copy); - int num_loops = - std::count_if(copy->nodes().begin(), copy->nodes().end(), is_loop); - ASSERT_EQ(num_loops, 2); - Code code(copy, ""); - InterpreterState interpreter{code}; - interpreter.run(stack); - ASSERT_EQ(stack.back().toInt(), 45); - } + auto cu = compile(str_func_def); + auto& f = cu->get_function("test_peel_n_times"); + auto stack = createStack({}); + // peeling loop once + { + LoopsPeeler peeler(true_pred, 1); + auto copy = f.graph()->copy(); + peeler.run(copy); + int num_loops = + std::count_if(copy->nodes().begin(), copy->nodes().end(), is_loop); + ASSERT_EQ(num_loops, 2); + Code code(copy, ""); + InterpreterState interpreter{code}; + interpreter.run(stack); + ASSERT_EQ(stack.back().toInt(), 45); + } - // test peeling more than one iteration - { - LoopsPeeler peeler(true_pred, 3); - auto copy = f.graph()->copy(); - peeler.run(copy); - int num_loops = - std::count_if(copy->nodes().begin(), copy->nodes().end(), is_loop); - ASSERT_EQ(num_loops, 2); - Code code(copy, ""); - InterpreterState interpreter{code}; - interpreter.run(stack); - ASSERT_EQ(stack.back().toInt(), 45); - } + // test peeling more than one iteration + { + LoopsPeeler peeler(true_pred, 3); + auto copy = f.graph()->copy(); + peeler.run(copy); + int num_loops = + std::count_if(copy->nodes().begin(), copy->nodes().end(), is_loop); + ASSERT_EQ(num_loops, 2); + Code code(copy, ""); + InterpreterState interpreter{code}; + interpreter.run(stack); + ASSERT_EQ(stack.back().toInt(), 45); } +} +TEST(LoopPeelerTest, LoopWithTerminationCondition) { // tests with explicit termination conditions - { - static const 
auto str_func_def = R"JIT( + static const auto str_func_def = R"JIT( def test_with_cond_times(): sum = 0 i = 0 @@ -1548,44 +1525,44 @@ void testLoopPeeler() { return sum )JIT"; - // the peel changes the termination condition to false - // so the original loop doesn't run - auto cu = compile(str_func_def); - auto& f = cu->get_function("test_with_cond_times"); - auto stack = createStack({}); - // peeling 5 iterations should update the termination - // condition to false - { - LoopsPeeler peeler(true_pred, 5); - auto copy = f.graph()->copy(); - peeler.run(copy); - int num_loops = - std::count_if(copy->nodes().begin(), copy->nodes().end(), is_loop); - ASSERT_EQ(num_loops, 2); - Code code(copy, ""); - InterpreterState interpreter{code}; - interpreter.run(stack); - ASSERT_EQ(stack.back().toInt(), 3); - } - - // the termination condition remains true - { - LoopsPeeler peeler(true_pred, 1); - auto copy = f.graph()->copy(); - peeler.run(copy); - int num_loops = - std::count_if(copy->nodes().begin(), copy->nodes().end(), is_loop); - ASSERT_EQ(num_loops, 2); - Code code(copy, ""); - InterpreterState interpreter{code}; - interpreter.run(stack); - ASSERT_EQ(stack.back().toInt(), 3); - } + // the peel changes the termination condition to false + // so the original loop doesn't run + auto cu = compile(str_func_def); + auto& f = cu->get_function("test_with_cond_times"); + auto stack = createStack({}); + // peeling 5 iterations should update the termination + // condition to false + { + LoopsPeeler peeler(true_pred, 5); + auto copy = f.graph()->copy(); + peeler.run(copy); + int num_loops = + std::count_if(copy->nodes().begin(), copy->nodes().end(), is_loop); + ASSERT_EQ(num_loops, 2); + Code code(copy, ""); + InterpreterState interpreter{code}; + interpreter.run(stack); + ASSERT_EQ(stack.back().toInt(), 3); } - // tests simple nested loops + // the termination condition remains true { - static const auto str_func_def = R"JIT( + LoopsPeeler peeler(true_pred, 1); + auto copy = f.graph()->copy(); + peeler.run(copy); + int num_loops = + std::count_if(copy->nodes().begin(), copy->nodes().end(), is_loop); + ASSERT_EQ(num_loops, 2); + Code code(copy, ""); + InterpreterState interpreter{code}; + interpreter.run(stack); + ASSERT_EQ(stack.back().toInt(), 3); + } +} + +// tests simple nested loops +TEST(LoopPeelerTest, SimpleNestedLoops) { + static const auto str_func_def = R"JIT( def test_nested_loops(): sum = 0 i = 0 @@ -1595,35 +1572,35 @@ void testLoopPeeler() { return sum )JIT"; - auto cu = compile(str_func_def); - auto& f = cu->get_function("test_nested_loops"); - auto stack = createStack({}); + auto cu = compile(str_func_def); + auto& f = cu->get_function("test_nested_loops"); + auto stack = createStack({}); - { - LoopsPeeler peeler(true_pred, 1); - auto copy = f.graph()->copy(); - peeler.run(copy); - ASSERT_EQ(countNodes(copy, is_loop), 5); - Code code(copy, ""); - InterpreterState interpreter{code}; - interpreter.run(stack); - ASSERT_EQ(stack.back().toInt(), 900); - } - - { - LoopsPeeler peeler(true_pred, 5); - auto copy = f.graph()->copy(); - peeler.run(copy); - ASSERT_EQ(countNodes(copy, is_loop), 5); - Code code(copy, ""); - InterpreterState interpreter{code}; - interpreter.run(stack); - ASSERT_EQ(stack.back().toInt(), 900); - } + { + LoopsPeeler peeler(true_pred, 1); + auto copy = f.graph()->copy(); + peeler.run(copy); + ASSERT_EQ(countNodes(copy, is_loop), 5); + Code code(copy, ""); + InterpreterState interpreter{code}; + interpreter.run(stack); + ASSERT_EQ(stack.back().toInt(), 900); } { - static const 
auto str_func_def = R"JIT( + LoopsPeeler peeler(true_pred, 5); + auto copy = f.graph()->copy(); + peeler.run(copy); + ASSERT_EQ(countNodes(copy, is_loop), 5); + Code code(copy, ""); + InterpreterState interpreter{code}; + interpreter.run(stack); + ASSERT_EQ(stack.back().toInt(), 900); + } +} + +TEST(LoopPeelerTest, SimpleNestedLoops2) { + static const auto str_func_def = R"JIT( def test_nested_loops(): sum = 0 i = 0 @@ -1635,34 +1612,33 @@ void testLoopPeeler() { return sum )JIT"; - auto cu = compile(str_func_def); - auto& f = cu->get_function("test_nested_loops"); - auto stack = createStack({}); - { - LoopsPeeler peeler(true_pred, 1); - auto copy = f.graph()->copy(); - peeler.run(copy); - ASSERT_EQ(countNodes(copy, is_loop), 5); - Code code(copy, ""); - InterpreterState interpreter{code}; - interpreter.run(stack); - ASSERT_EQ(stack.back().toInt(), 3); - } + auto cu = compile(str_func_def); + auto& f = cu->get_function("test_nested_loops"); + auto stack = createStack({}); + { + LoopsPeeler peeler(true_pred, 1); + auto copy = f.graph()->copy(); + peeler.run(copy); + ASSERT_EQ(countNodes(copy, is_loop), 5); + Code code(copy, ""); + InterpreterState interpreter{code}; + interpreter.run(stack); + ASSERT_EQ(stack.back().toInt(), 3); + } - { - LoopsPeeler peeler(true_pred, 5); - auto copy = f.graph()->copy(); - peeler.run(copy); - ASSERT_EQ(countNodes(copy, is_loop), 5); - Code code(copy, ""); - InterpreterState interpreter{code}; - interpreter.run(stack); - ASSERT_EQ(stack.back().toInt(), 3); - } + { + LoopsPeeler peeler(true_pred, 5); + auto copy = f.graph()->copy(); + peeler.run(copy); + ASSERT_EQ(countNodes(copy, is_loop), 5); + Code code(copy, ""); + InterpreterState interpreter{code}; + interpreter.run(stack); + ASSERT_EQ(stack.back().toInt(), 3); } } -void testInsertAndEliminateRedundantGuards() { +TEST(InsertAndEliminateRedundantGuardsTest, Basic) { static const auto basic_example = R"JIT( def basic(x, y): a = x + y @@ -1705,7 +1681,7 @@ void testInsertAndEliminateRedundantGuards() { ASSERT_EQ(num_guards, 2); } -void testInsertBailOuts() { +TEST(InsertBailOutsTest, Basic) { static const auto basic_example = R"JIT( def basic_loop(x, y): @@ -1754,7 +1730,7 @@ void testInsertBailOuts() { } } -void testProfiler() { +TEST(ProfilerTest, Basic) { constexpr int batch_size = 4; constexpr int input_size = 256; @@ -1804,7 +1780,7 @@ void testProfiler() { checkShape(tanh_n->inputs().at(0)->node()->ty(attr::profiled_type), eltwise); } -void testCallStack() { +TEST(CallStackTest, Basic) { const auto text = R"( def ham(x): return x/7 @@ -1880,7 +1856,7 @@ def foo(x): } } -void testCallStackCaching() { +TEST(CallStackTest, Caching) { const auto text = R"( def a(x): @@ -1923,7 +1899,7 @@ def c(x): ASSERT_TRUE(callstack_objects.at("a1") == callstack_objects.at("a2")); } -void testAutogradSymbols() { +TEST(AutogradSymbolsTest, Basic) { Symbol sym = Symbol::fromQualString("aten::test_symbol"); Graph graph; auto node = graph.create(sym); @@ -1942,7 +1918,7 @@ void testAutogradSymbols() { TORCH_CHECK(!canRunWithAutograd(node)); } -void testDefaultArgTypeHinting() { +TEST(DefaultArgTypeHintingTest, Basic) { const auto text_non_hinted = R"( def a(x, y=1): @@ -1968,184 +1944,182 @@ def a(x, y:int=1): auto cu = compile(text_hinted); } -void testFutures() { - // Basic set case. 
-  {
-    auto f1 = c10::make_intrusive<ivalue::Future>(IntType::get());
-    ASSERT_FALSE(f1->completed());
-    ASSERT_FALSE(f1->hasValue());
-    int32_t sat1 = 0;
-    int32_t sat2 = 0;
-    f1->addCallback([&]() { ++sat1; });
-    f1->markCompleted(43);
-    ASSERT_TRUE(f1->completed());
-    ASSERT_TRUE(f1->hasValue());
-    ASSERT_FALSE(f1->hasError());
-    ASSERT_EQ(sat1, 1);
-    ASSERT_EQ(f1->constValue().toInt(), 43);
-    ASSERT_EQ(f1->value().toInt(), 43);
-    f1->addCallback([&]() { ++sat2; });
-    ASSERT_EQ(sat1, 1);
-    ASSERT_EQ(sat2, 1);
-  }
+// Basic set case.
+TEST(FuturesTest, Basic) {
+  auto f1 = c10::make_intrusive<ivalue::Future>(IntType::get());
+  ASSERT_FALSE(f1->completed());
+  ASSERT_FALSE(f1->hasValue());
+  int32_t sat1 = 0;
+  int32_t sat2 = 0;
+  f1->addCallback([&]() { ++sat1; });
+  f1->markCompleted(43);
+  ASSERT_TRUE(f1->completed());
+  ASSERT_TRUE(f1->hasValue());
+  ASSERT_FALSE(f1->hasError());
+  ASSERT_EQ(sat1, 1);
+  ASSERT_EQ(f1->constValue().toInt(), 43);
+  ASSERT_EQ(f1->value().toInt(), 43);
+  f1->addCallback([&]() { ++sat2; });
+  ASSERT_EQ(sat1, 1);
+  ASSERT_EQ(sat2, 1);
+}

-  // Basic error cases.
-  {
-    auto f1 = c10::make_intrusive<ivalue::Future>(IntType::get());
-    int sat1 = 0;
-    int sat2 = 0;
-    f1->addCallback([&]() { ++sat1; });
-    f1->setError(
-        std::make_exception_ptr(c10::ivalue::Future::FutureError("Failed")));
-    ASSERT_EQ(sat1, 1);
-    ASSERT_TRUE(f1->completed());
-    ASSERT_TRUE(f1->hasError());
-    ASSERT_FALSE(f1->hasValue());
-    try {
-      (void)f1->value();
-      ASSERT_TRUE(false); // Supposed to throw.
-    } catch (const std::exception& e) {
-      ASSERT_TRUE(strcmp(e.what(), "Failed") == 0);
-    }
-    f1->addCallback([&]() { ++sat2; });
-    ASSERT_EQ(sat1, 1);
-    ASSERT_EQ(sat2, 1);
-    f1->setErrorIfNeeded(
-        std::make_exception_ptr(c10::ivalue::Future::FutureError("Dup")));
-    ASSERT_TRUE(strcmp(f1->tryRetrieveErrorMessage().c_str(), "Failed") == 0);
-    ASSERT_EQ(sat1, 1);
-    ASSERT_EQ(sat2, 1);
+// Basic error cases.
+TEST(FuturesTest, Error) {
+  auto f1 = c10::make_intrusive<ivalue::Future>(IntType::get());
+  int sat1 = 0;
+  int sat2 = 0;
+  f1->addCallback([&]() { ++sat1; });
+  f1->setError(
+      std::make_exception_ptr(c10::ivalue::Future::FutureError("Failed")));
+  ASSERT_EQ(sat1, 1);
+  ASSERT_TRUE(f1->completed());
+  ASSERT_TRUE(f1->hasError());
+  ASSERT_FALSE(f1->hasValue());
+  try {
+    (void)f1->value();
+    ASSERT_TRUE(false); // Supposed to throw.
+  } catch (const std::exception& e) {
+    ASSERT_TRUE(strcmp(e.what(), "Failed") == 0);
  }
+  f1->addCallback([&]() { ++sat2; });
+  ASSERT_EQ(sat1, 1);
+  ASSERT_EQ(sat2, 1);
+  f1->setErrorIfNeeded(
+      std::make_exception_ptr(c10::ivalue::Future::FutureError("Dup")));
+  ASSERT_TRUE(strcmp(f1->tryRetrieveErrorMessage().c_str(), "Failed") == 0);
+  ASSERT_EQ(sat1, 1);
+  ASSERT_EQ(sat2, 1);
+}

-  // then
-  {
-    auto f1 = c10::make_intrusive<ivalue::Future>(IntType::get());
-    auto f2 = f1->then(
-        [f1]() -> IValue { return f1->constValue().toInt() + 1; },
-        IntType::get());
-    auto f3 = f2->then(
-        [f2]() -> IValue { return f2->constValue().toInt() * 3; },
-        IntType::get());
-    bool done = false;
-    f3->addCallback([f3, &done]() {
-      ASSERT_EQ(f3->constValue().toInt(), (42 + 1) * 3);
-      done = true;
-    });
-    ASSERT_FALSE(done);
-    f1->markCompleted(42);
-    ASSERT_TRUE(done);
-  }
+// then
+TEST(FuturesTest, Then) {
+  auto f1 = c10::make_intrusive<ivalue::Future>(IntType::get());
+  auto f2 = f1->then(
+      [f1]() -> IValue { return f1->constValue().toInt() + 1; },
+      IntType::get());
+  auto f3 = f2->then(
+      [f2]() -> IValue { return f2->constValue().toInt() * 3; },
+      IntType::get());
+  bool done = false;
+  f3->addCallback([f3, &done]() {
+    ASSERT_EQ(f3->constValue().toInt(), (42 + 1) * 3);
+    done = true;
+  });
+  ASSERT_FALSE(done);
+  f1->markCompleted(42);
+  ASSERT_TRUE(done);
+}

-  // collectAll()
-  {
-    auto s1 = c10::make_intrusive<ivalue::Future>(IntType::get());
-    auto s2 = c10::make_intrusive<ivalue::Future>(IntType::get());
-    auto s3 = c10::make_intrusive<ivalue::Future>(IntType::get());
-
-    // Empty case
-    c10::List<intrusive_ptr<ivalue::Future>> futures(
-        FutureType::create(IntType::get()));
-    auto c1 = collectAll(futures);
-    ASSERT_TRUE(c1->completed());
-    ASSERT_EQ(c1->value().toList().size(), 0);
-    ASSERT_TRUE(
-        *(c1->value().toList().elementType()) ==
-        *FutureType::create(IntType::get()));
-
-    // 1-element, initially not completed.
-    futures.push_back(s1);
-    auto c2 = collectAll(futures);
-    ASSERT_FALSE(c2->completed());
-    s1->markCompleted(5);
-    ASSERT_TRUE(c2->completed());
-    ASSERT_EQ(c2->value().toList().size(), 1);
-    ASSERT_TRUE(
-        *(c2->value().toList().elementType()) ==
-        *FutureType::create(IntType::get()));
-    ASSERT_EQ(c2->value().toList().get(0).toFuture()->value().toInt(), 5);
-
-    // 1-element, already completed
-    auto c3 = collectAll(futures);
-    ASSERT_TRUE(c3->completed());
-    ASSERT_EQ(c3->value().toList().size(), 1);
-    ASSERT_EQ(c3->value().toList().get(0).toFuture()->value().toInt(), 5);
-
-    // 3 elements.
-    futures.push_back(s2);
-    futures.push_back(s3);
-    auto c4 = collectAll(futures);
-    ASSERT_FALSE(c4->completed());
-    s3->markCompleted(7);
-    ASSERT_FALSE(c4->completed());
-    s2->markCompleted(6);
-    ASSERT_TRUE(c4->completed());
-    ASSERT_EQ(c4->value().toList().size(), 3);
-    ASSERT_EQ(c4->value().toList().get(0).toFuture()->value().toInt(), 5);
-    ASSERT_EQ(c4->value().toList().get(1).toFuture()->value().toInt(), 6);
-    ASSERT_EQ(c4->value().toList().get(2).toFuture()->value().toInt(), 7);
-    ASSERT_TRUE(
-        *(c4->value().toList().elementType()) ==
-        *FutureType::create(IntType::get()));
-
-    // Handle exception in the list.
- auto s4 = c10::make_intrusive(IntType::get()); - futures.push_back(s4); - auto c5 = collectAll(futures); - ASSERT_FALSE(c5->completed()); - s4->setError( - std::make_exception_ptr(c10::ivalue::Future::FutureError("Failed"))); - ASSERT_TRUE(c5->completed()); - ASSERT_EQ(c5->value().toList().size(), 4); - try { - (void)c5->value().toList().get(3).toFuture()->value(); - ASSERT_TRUE(false); // supposed to throw - } catch (const std::exception& e) { - ASSERT_EQ(std::string(e.what()), "Failed"); - } +// collectAll() +TEST(FuturesTest, CollectAll) { + auto s1 = c10::make_intrusive(IntType::get()); + auto s2 = c10::make_intrusive(IntType::get()); + auto s3 = c10::make_intrusive(IntType::get()); + + // Empty case + c10::List> futures( + FutureType::create(IntType::get())); + auto c1 = collectAll(futures); + ASSERT_TRUE(c1->completed()); + ASSERT_EQ(c1->value().toList().size(), 0); + ASSERT_TRUE( + *(c1->value().toList().elementType()) == + *FutureType::create(IntType::get())); + + // 1-element, initially not completed. + futures.push_back(s1); + auto c2 = collectAll(futures); + ASSERT_FALSE(c2->completed()); + s1->markCompleted(5); + ASSERT_TRUE(c2->completed()); + ASSERT_EQ(c2->value().toList().size(), 1); + ASSERT_TRUE( + *(c2->value().toList().elementType()) == + *FutureType::create(IntType::get())); + ASSERT_EQ(c2->value().toList().get(0).toFuture()->value().toInt(), 5); + + // 1-element, already completed + auto c3 = collectAll(futures); + ASSERT_TRUE(c3->completed()); + ASSERT_EQ(c3->value().toList().size(), 1); + ASSERT_EQ(c3->value().toList().get(0).toFuture()->value().toInt(), 5); + + // 3 elements. + futures.push_back(s2); + futures.push_back(s3); + auto c4 = collectAll(futures); + ASSERT_FALSE(c4->completed()); + s3->markCompleted(7); + ASSERT_FALSE(c4->completed()); + s2->markCompleted(6); + ASSERT_TRUE(c4->completed()); + ASSERT_EQ(c4->value().toList().size(), 3); + ASSERT_EQ(c4->value().toList().get(0).toFuture()->value().toInt(), 5); + ASSERT_EQ(c4->value().toList().get(1).toFuture()->value().toInt(), 6); + ASSERT_EQ(c4->value().toList().get(2).toFuture()->value().toInt(), 7); + ASSERT_TRUE( + *(c4->value().toList().elementType()) == + *FutureType::create(IntType::get())); + + // Handle exception in the list. + auto s4 = c10::make_intrusive(IntType::get()); + futures.push_back(s4); + auto c5 = collectAll(futures); + ASSERT_FALSE(c5->completed()); + s4->setError( + std::make_exception_ptr(c10::ivalue::Future::FutureError("Failed"))); + ASSERT_TRUE(c5->completed()); + ASSERT_EQ(c5->value().toList().size(), 4); + try { + (void)c5->value().toList().get(3).toFuture()->value(); + ASSERT_TRUE(false); // supposed to throw + } catch (const std::exception& e) { + ASSERT_EQ(std::string(e.what()), "Failed"); } +} - // collectAny() - { - auto s1 = c10::make_intrusive(IntType::get()); - - // Empty case - c10::List> futures( - FutureType::create(IntType::get())); - auto c1 = collectAny(futures); - ASSERT_TRUE(c1->completed()); - - // 1 element, not yet satisfied - futures.push_back(s1); - auto c2 = collectAny(futures); - ASSERT_FALSE(c2->completed()); - s1->markCompleted(5); - ASSERT_TRUE(c2->completed()); - ASSERT_TRUE(c2->value().isInt()); - ASSERT_EQ(c2->value().toInt(), 5); - - // 1 element already satisfied. 
- auto c3 = collectAny(futures); - ASSERT_TRUE(c3->completed()); - ASSERT_TRUE(c3->value().isInt()); - ASSERT_EQ(c3->value().toInt(), 5); - - // 2 elements - futures.clear(); - auto s2 = c10::make_intrusive(IntType::get()); - auto s3 = c10::make_intrusive(IntType::get()); - futures.push_back(s2); - futures.push_back(s3); - auto c4 = collectAny(futures); - ASSERT_FALSE(c4->completed()); - s3->markCompleted(7); - ASSERT_TRUE(c4->completed()); - ASSERT_EQ(c4->value().toInt(), 7); - s2->markCompleted(1); - ASSERT_EQ(c4->value().toInt(), 7); - } +// collectAny() +TEST(FuturesTest, CollectAny) { + auto s1 = c10::make_intrusive(IntType::get()); + + // Empty case + c10::List> futures( + FutureType::create(IntType::get())); + auto c1 = collectAny(futures); + ASSERT_TRUE(c1->completed()); + + // 1 element, not yet satisfied + futures.push_back(s1); + auto c2 = collectAny(futures); + ASSERT_FALSE(c2->completed()); + s1->markCompleted(5); + ASSERT_TRUE(c2->completed()); + ASSERT_TRUE(c2->value().isInt()); + ASSERT_EQ(c2->value().toInt(), 5); + + // 1 element already satisfied. + auto c3 = collectAny(futures); + ASSERT_TRUE(c3->completed()); + ASSERT_TRUE(c3->value().isInt()); + ASSERT_EQ(c3->value().toInt(), 5); + + // 2 elements + futures.clear(); + auto s2 = c10::make_intrusive(IntType::get()); + auto s3 = c10::make_intrusive(IntType::get()); + futures.push_back(s2); + futures.push_back(s3); + auto c4 = collectAny(futures); + ASSERT_FALSE(c4->completed()); + s3->markCompleted(7); + ASSERT_TRUE(c4->completed()); + ASSERT_EQ(c4->value().toInt(), 7); + s2->markCompleted(1); + ASSERT_EQ(c4->value().toInt(), 7); } -void testTLSFutureCallbacks() { +TEST(TLSFutureCallbacksTest, Basic) { // cb that verifies the profiler is enabled auto profilerEnabledCb = []() { ASSERT_TRUE(torch::autograd::profiler::profilerEnabled()); @@ -2184,5 +2158,75 @@ void testTLSFutureCallbacks() { } } +TEST(ProfilerDisableInCallbackTest, Basic) { + // cb that verifies the profiler is enabled + auto profilerEnabledCb = []() { + ASSERT_TRUE(torch::autograd::profiler::profilerEnabled()); + }; + torch::autograd::profiler::enableProfiler( + torch::autograd::profiler::ProfilerConfig( + torch::autograd::profiler::ProfilerState::CPU, false, false)); + auto s1 = c10::make_intrusive(IntType::get()); + auto verifyProfilerCb = wrapPropagateTLSState([&profilerEnabledCb] { + // Ensure the profiler is still enabled in this thread. + profilerEnabledCb(); + auto t1 = torch::ones({2, 2}); + auto t2 = torch::ones({2, 2}); + torch::add(t1, t2); + // Don't cleanup TLSState, and just consolidate. + auto opts = torch::autograd::profiler::ProfilerDisableOptions(false, true); + auto thread_event_lists = + torch::autograd::profiler::disableProfiler(std::move(opts)); + // Ensure that the events from this thread are still profiled and we obtain + // the expected in events in our consolidated list when calling + // disableProfiler(). + bool found_ones = false; + bool found_add = false; + for (const auto& li : thread_event_lists) { + for (const auto& evt : li) { + if (strcmp(evt.name(), "aten::add") == 0) { + found_add = true; + } else if (strcmp(evt.name(), "aten::ones") == 0) { + found_ones = true; + } + } + if (found_add && found_ones) { + break; + } + } + ASSERT_TRUE(found_ones); + ASSERT_TRUE(found_add); + }); + + s1->addCallback(verifyProfilerCb); + // Disable the profiler, but do not consolidate results in the main thread. 
+ auto opts = torch::autograd::profiler::ProfilerDisableOptions(true, false); + torch::autograd::profiler::disableProfiler(std::move(opts)); + std::thread t([s1 = std::move(s1)]() { s1->markCompleted(at::IValue(1)); }); + t.join(); + + // Similar to above test, but verifies correctness in the case where + // continuation runs on the main thread. + torch::autograd::profiler::enableProfiler( + torch::autograd::profiler::ProfilerConfig( + torch::autograd::profiler::ProfilerState::CPU, false, false)); + s1 = c10::make_intrusive(IntType::get()); + s1->addCallback(verifyProfilerCb); + // Runs callback inline + s1->markCompleted(at::IValue(1)); + opts = torch::autograd::profiler::ProfilerDisableOptions(true, false); + torch::autograd::profiler::disableProfiler(std::move(opts)); +} + +TEST(IValueKWargsTest, Basic) { + const auto text = R"( + def foo(a : int, b : int, c : int = 4): + return a + 2*b + 3*c + )"; + auto cu = compile(text); + auto result = cu->get_function("foo")({1}, {{"b", 3}}); + ASSERT_EQ(result.toInt(), 19); +} + } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_mobile_type_parser.cpp b/test/cpp/jit/test_mobile_type_parser.cpp index 989d16794bd2..7e24e5dc65bc 100644 --- a/test/cpp/jit/test_mobile_type_parser.cpp +++ b/test/cpp/jit/test_mobile_type_parser.cpp @@ -1,5 +1,6 @@ -#include "test/cpp/jit/test_base.h" -//#include +#include + +#include namespace c10 { // std::string serializeType(const Type &t); @@ -8,50 +9,74 @@ TypePtr parseType(const std::string& pythonStr); namespace torch { namespace jit { -void testMobileTypeParser() { +TEST(MobileTypeParserTest, Empty) { std::string empty_ps(""); ASSERT_ANY_THROW(c10::parseType(empty_ps)); +} +TEST(MobileTypeParserTest, RoundTripAnnotationStr) { std::string int_ps("int"); auto int_tp = c10::parseType(int_ps); std::string int_tps = int_tp->annotation_str(); ASSERT_EQ(int_ps, int_tps); +} +TEST(MobileTypeParserTest, NestedContainersAnnotationStr) { std::string tuple_ps( "Tuple[str, Optional[float], Dict[str, List[Tensor]], int]"); auto tuple_tp = c10::parseType(tuple_ps); std::string tuple_tps = tuple_tp->annotation_str(); ASSERT_EQ(tuple_ps, tuple_tps); +} +TEST(MobileTypeParserTest, NestedContainersAnnotationStrWithSpaces) { + std::string tuple_ps( + "Tuple[str, Optional[float], Dict[str, List[Tensor]], int]"); std::string tuple_space_ps( "Tuple[ str, Optional[float], Dict[str, List[Tensor ]] , int]"); auto tuple_space_tp = c10::parseType(tuple_space_ps); // tuple_space_tps should not have weird white spaces std::string tuple_space_tps = tuple_space_tp->annotation_str(); ASSERT_EQ(tuple_ps, tuple_space_tps); +} +TEST(MobileTypeParserTest, TypoRaises) { std::string typo_token("List[tensor]"); ASSERT_ANY_THROW(c10::parseType(typo_token)); +} +TEST(MobileTypeParserTest, MismatchBracketRaises) { std::string mismatch1("List[Tensor"); ASSERT_ANY_THROW(c10::parseType(mismatch1)); +} +TEST(MobileTypeParserTest, MismatchBracketRaises2) { std::string mismatch2("List[[Tensor]"); ASSERT_ANY_THROW(c10::parseType(mismatch2)); +} +TEST(MobileTypeParserTest, DictWithoutValueRaises) { std::string mismatch3("Dict[Tensor]"); ASSERT_ANY_THROW(c10::parseType(mismatch3)); +} +TEST(MobileTypeParserTest, ListArgCountMismatchRaises) { // arg count mismatch std::string mismatch4("List[int, str]"); ASSERT_ANY_THROW(c10::parseType(mismatch4)); +} +TEST(MobileTypeParserTest, DictArgCountMismatchRaises) { std::string trailing_commm("Dict[str,]"); ASSERT_ANY_THROW(c10::parseType(trailing_commm)); +} +TEST(MobileTypeParserTest, 
ValidTypeWithExtraStuffRaises) { std::string extra_stuff("int int"); ASSERT_ANY_THROW(c10::parseType(extra_stuff)); +} +TEST(MobileTypeParserTest, NonIdentifierRaises) { std::string non_id("(int)"); ASSERT_ANY_THROW(c10::parseType(non_id)); } diff --git a/test/cpp/jit/test_module_api.cpp b/test/cpp/jit/test_module_api.cpp index 386addd9fbec..910331166d51 100644 --- a/test/cpp/jit/test_module_api.cpp +++ b/test/cpp/jit/test_module_api.cpp @@ -1,4 +1,5 @@ -#include +#include + #include #include @@ -42,7 +43,7 @@ static void import_libs( si.loadType(QualifiedName(class_name)); } -void testModuleClone() { +TEST(ModuleAPITest, Clone) { auto cu = std::make_shared(); // creating child module auto child = ClassType::create("child", cu, true); @@ -71,7 +72,7 @@ void testModuleClone() { ASSERT_EQ(Module(p2.attr("c2").toObject()).attr(attr_name).toInt(), 3); } -void testModuleCloneWithModuleInterface() { +TEST(ModuleAPITest, CloneWithModuleInterface) { auto cu = std::make_shared(); // define a initial module with two submods share same interface @@ -115,7 +116,7 @@ void testModuleCloneWithModuleInterface() { ASSERT_NE(clonedMod.type(), parentMod.type()); } -void testModuleCopy() { +TEST(ModuleAPITest, Copy) { auto cu = std::make_shared(); auto cls = ClassType::create("foo.bar", cu, true); auto attr_name = "attr"; @@ -144,7 +145,7 @@ void testModuleCopy() { ASSERT_EQ(m3.attr(attr_name).toInt(), 3); } -void testModuleDeepcopy() { +TEST(ModuleAPITest, DeepCopy) { auto cu = std::make_shared(); auto cls = ClassType::create("foo.bar", cu, true); auto str_attr = "str_attr"; @@ -203,7 +204,7 @@ void testModuleDeepcopy() { ASSERT_TRUE(t1.equal(t3)); } -void testModuleDeepcopyString() { +TEST(ModuleAPITest, DeepCopyString) { auto cu = std::make_shared(); auto cls = ClassType::create("foo.bar", cu, true); auto attr1 = "attr1"; @@ -219,7 +220,7 @@ void testModuleDeepcopyString() { ASSERT_EQ(copied.attr(attr1).toString()->string(), original_str); } -void testModuleDeepcopyAliasing() { +TEST(ModuleAPITest, DeepCopyPreservesAliasing) { // check deepcopy preserves aliasing auto cu = std::make_shared(); auto cls = ClassType::create("foo.bar", cu, true); @@ -256,7 +257,7 @@ void testModuleDeepcopyAliasing() { ASSERT_TRUE(copied_attr3.isAliasOf(copied_attr4)); } -void testModuleConstant() { +TEST(ModuleAPITest, Constants) { auto cu = std::make_shared(); auto cls = ClassType::create("foo.bar", cu, true); auto attr_name = "attr"; @@ -272,7 +273,7 @@ void testModuleConstant() { ASSERT_EQ(m.attr(const_name).toInt(), 3); } -void testModuleParameter() { +TEST(ModuleAPITest, Parameters) { auto cu = std::make_shared(); auto cls = ClassType::create("foo.bar", cu, true); Module m(cu, cls); @@ -291,5 +292,39 @@ void testModuleParameter() { ASSERT_TRUE(m.hasattr("none_param2")); } +TEST(ModuleAPITest, Define) { + Module m("m"); + m.register_parameter("foo", torch::ones({}), false); + m.define(R"( + def add_it(self, x, b : int = 4): + return self.foo + x + b + )"); + auto result = m.run_method("add_it", torch::ones({})); + AT_ASSERT(result.toTensor().item() == 6); +} + +TEST(ModuleAPITest, To_CUDA) { + Module m("test"); + { + // test cuda to cpu for params and buffers + m.register_parameter("foo", torch::ones({}, at::kCUDA), false); + m.register_buffer("bar", torch::ones({}, at::kCUDA)); + + m.to(at::kCUDA); + m.to(at::kCPU); + AT_ASSERT(m.attr("foo").toTensor().device().is_cpu()); + AT_ASSERT(m.attr("bar").toTensor().device().is_cpu()); + } + { + // test cpu to cuda for params and buffers + m.register_parameter("foo", 
torch::ones({}), false); + m.register_buffer("bar", torch::ones({})); + + m.to(at::kCUDA); + AT_ASSERT(m.attr("foo").toTensor().device().is_cuda()); + AT_ASSERT(m.attr("bar").toTensor().device().is_cuda()); + } +} + } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_peephole_optimize.cpp b/test/cpp/jit/test_peephole_optimize.cpp index 5382d556613d..9985faa6e9bd 100644 --- a/test/cpp/jit/test_peephole_optimize.cpp +++ b/test/cpp/jit/test_peephole_optimize.cpp @@ -1,4 +1,5 @@ -#include +#include + #include #include @@ -8,47 +9,48 @@ namespace torch { namespace jit { -void testPeepholeOptimize() { - // test is / is not none optimization - { - auto graph = std::make_shared(); - parseIR( - R"IR( +TEST(PeepholeOptimizeTest, IsAndIsNot) +// test is / is not none optimization +{ + auto graph = std::make_shared(); + parseIR( + R"IR( graph(%0 : int): %1 : None = prim::Constant() %2 : bool = aten::__is__(%0, %1) %3 : bool = aten::__isnot__(%0, %1) return (%2, %3) )IR", - graph.get()); - PeepholeOptimize(graph); - testing::FileCheck() - .check_not("aten::__is__") - ->check_not("aten::__isnot__") - ->run(*graph); - } - { - auto graph = std::make_shared(); - parseIR( - R"IR( + graph.get()); + PeepholeOptimize(graph); + testing::FileCheck() + .check_not("aten::__is__") + ->check_not("aten::__isnot__") + ->run(*graph); +} + +TEST(PeepholeOptimizeTest, IsAndIsNot2) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(%0: int?): %1 : None = prim::Constant() %2 : bool = aten::__is__(%0, %1) %3 : bool = aten::__isnot__(%0, %1) return (%2, %3) )IR", - graph.get()); - PeepholeOptimize(graph); - testing::FileCheck() - .check("aten::__is__") - ->check("aten::__isnot__") - ->run(*graph); - } + graph.get()); + PeepholeOptimize(graph); + testing::FileCheck() + .check("aten::__is__") + ->check("aten::__isnot__") + ->run(*graph); +} - { - auto graph = std::make_shared(); - parseIR( - R"IR( +TEST(PeepholeOptimizeTest, IsAndIsNot3) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(%0: int?): %1 : Tensor = prim::AutogradZero() %2 : None = prim::Constant() @@ -56,48 +58,49 @@ graph(%0: int?): %5 : bool = aten::__isnot__(%1, %2) return (%4, %5) )IR", - graph.get()); - PeepholeOptimize(graph); - testing::FileCheck() - .check("aten::__is__") - ->check_not("aten::__isnot__") - ->run(*graph); - } + graph.get()); + PeepholeOptimize(graph); + testing::FileCheck() + .check("aten::__is__") + ->check_not("aten::__isnot__") + ->run(*graph); +} - // test unwrap optional - { - auto graph = std::make_shared(); - parseIR( - R"IR( +TEST(PeepholeOptimizeTest, UnwrapOptional) +// test unwrap optional +{ + auto graph = std::make_shared(); + parseIR( + R"IR( graph(): %1 : Float(*, *, *) = prim::Constant() %2 : bool = aten::_unwrap_optional(%1) %3 : bool = prim::unchecked_unwrap_optional(%1) return (%2, %3) )IR", - graph.get()); - PeepholeOptimize(graph); - testing::FileCheck().check_not("unwrap")->run(*graph); - } - { - auto graph = std::make_shared(); - parseIR( - R"IR( + graph.get()); + PeepholeOptimize(graph); + testing::FileCheck().check_not("unwrap")->run(*graph); +} + +TEST(PeepholeOptimizeTest, UnwrapOptional2) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(%1 : Float(*, *, *)?): %2 : bool = aten::_unwrap_optional(%1) %3 : bool = prim::unchecked_unwrap_optional(%1) return (%2, %3) )IR", - graph.get()); - PeepholeOptimize(graph); - testing::FileCheck().check_count("unwrap", 2)->run(*graph); - } + graph.get()); + PeepholeOptimize(graph); + testing::FileCheck().check_count("unwrap", 
2)->run(*graph); +} - // tests addmm fusion - { - auto graph = std::make_shared(); - parseIR( - R"IR( +TEST(PeepholeOptimizeTest, AddMMFusion) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph( %0 : Float(2, 3, 4), %1 : Float(2, 3, 4), @@ -108,10 +111,9 @@ graph(%1 : Float(*, *, *)?): %6 : Tensor = aten::add(%5, %2, %3) return (%6) )IR", - graph.get()); - FuseAddMM(graph); - testing::FileCheck().check("addmm")->run(*graph); - } + graph.get()); + FuseAddMM(graph); + testing::FileCheck().check("addmm")->run(*graph); } } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_qualified_name.cpp b/test/cpp/jit/test_qualified_name.cpp index 0f387bb542ed..80028ada8565 100644 --- a/test/cpp/jit/test_qualified_name.cpp +++ b/test/cpp/jit/test_qualified_name.cpp @@ -1,68 +1,70 @@ +#include #include #include -#include "test/cpp/jit/test_base.h" using c10::QualifiedName; namespace torch { namespace jit { -void testQualifiedName() { - { - // Test prefix construction - auto foo = QualifiedName("foo"); - auto bar = QualifiedName(foo, "bar"); - auto baz = QualifiedName(bar, "baz"); - ASSERT_EQ(baz.qualifiedName(), "foo.bar.baz"); - ASSERT_EQ(baz.prefix(), "foo.bar"); - ASSERT_EQ(baz.name(), "baz"); - auto nullstate = QualifiedName(); - ASSERT_EQ(nullstate.qualifiedName(), ""); - ASSERT_EQ(nullstate.prefix(), ""); - ASSERT_EQ(nullstate.name(), ""); - } - { - // Test dotted construction - auto foo = QualifiedName("foo.bar.baz"); - ASSERT_EQ(foo.qualifiedName(), "foo.bar.baz"); - ASSERT_EQ(foo.prefix(), "foo.bar"); - ASSERT_EQ(foo.name(), "baz"); +TEST(QualifiedNameTest, PrefixConstruction) { + // Test prefix construction + auto foo = QualifiedName("foo"); + auto bar = QualifiedName(foo, "bar"); + auto baz = QualifiedName(bar, "baz"); + ASSERT_EQ(baz.qualifiedName(), "foo.bar.baz"); + ASSERT_EQ(baz.prefix(), "foo.bar"); + ASSERT_EQ(baz.name(), "baz"); + auto nullstate = QualifiedName(); + ASSERT_EQ(nullstate.qualifiedName(), ""); + ASSERT_EQ(nullstate.prefix(), ""); + ASSERT_EQ(nullstate.name(), ""); +} + +TEST(QualifiedNameTest, DottedConstruction) { + // Test dotted construction + auto foo = QualifiedName("foo.bar.baz"); + ASSERT_EQ(foo.qualifiedName(), "foo.bar.baz"); + ASSERT_EQ(foo.prefix(), "foo.bar"); + ASSERT_EQ(foo.name(), "baz"); + + auto bar = QualifiedName("bar"); + ASSERT_EQ(bar.qualifiedName(), "bar"); + ASSERT_EQ(bar.prefix(), ""); + ASSERT_EQ(bar.name(), "bar"); +} + +TEST(QualifiedNameTest, BadInputRaises) { + // throw some bad inputs at it + ASSERT_ANY_THROW(QualifiedName("foo..bar")); + ASSERT_ANY_THROW(QualifiedName(".foo.bar")); + ASSERT_ANY_THROW(QualifiedName("foo.bar.")); + ASSERT_ANY_THROW(QualifiedName("")); +} + +TEST(QualifiedNameTest, Equality) { + // test equality api + auto foo1 = QualifiedName("foo.bar.baz"); + auto foo2 = QualifiedName("foo.bar.baz"); + auto foo3 = QualifiedName("bar.bar.baz"); + ASSERT_EQ(foo1, foo2); + ASSERT_NE(foo1, foo3); + auto bar1 = QualifiedName("sup"); + auto bar2 = QualifiedName("sup"); + ASSERT_EQ(foo1, foo2); +} - auto bar = QualifiedName("bar"); - ASSERT_EQ(bar.qualifiedName(), "bar"); - ASSERT_EQ(bar.prefix(), ""); - ASSERT_EQ(bar.name(), "bar"); - } - { - // throw some bad inputs at it - ASSERT_ANY_THROW(QualifiedName("foo..bar")); - ASSERT_ANY_THROW(QualifiedName(".foo.bar")); - ASSERT_ANY_THROW(QualifiedName("foo.bar.")); - ASSERT_ANY_THROW(QualifiedName("")); - } - { - // test equality api - auto foo1 = QualifiedName("foo.bar.baz"); - auto foo2 = QualifiedName("foo.bar.baz"); - auto foo3 = 
QualifiedName("bar.bar.baz"); - ASSERT_EQ(foo1, foo2); - ASSERT_NE(foo1, foo3); - auto bar1 = QualifiedName("sup"); - auto bar2 = QualifiedName("sup"); - ASSERT_EQ(foo1, foo2); - } - { - // test prefix api - auto foo1 = QualifiedName("foo.bar.baz"); - auto foo2 = QualifiedName("foo.bar"); - auto foo3 = QualifiedName("bar.bar.baz"); - auto foo4 = QualifiedName("foo.bar"); - ASSERT_TRUE(foo2.isPrefixOf(foo1)); - ASSERT_TRUE(foo2.isPrefixOf(foo4)); - ASSERT_TRUE(foo4.isPrefixOf(foo2)); - ASSERT_FALSE(foo1.isPrefixOf(foo2)); - ASSERT_FALSE(foo2.isPrefixOf(foo3)); - } +TEST(QualifiedNameTest, IsPrefixOf) { + // test prefix api + auto foo1 = QualifiedName("foo.bar.baz"); + auto foo2 = QualifiedName("foo.bar"); + auto foo3 = QualifiedName("bar.bar.baz"); + auto foo4 = QualifiedName("foo.bar"); + ASSERT_TRUE(foo2.isPrefixOf(foo1)); + ASSERT_TRUE(foo2.isPrefixOf(foo4)); + ASSERT_TRUE(foo4.isPrefixOf(foo2)); + ASSERT_FALSE(foo1.isPrefixOf(foo2)); + ASSERT_FALSE(foo2.isPrefixOf(foo3)); } } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_save_load.cpp b/test/cpp/jit/test_save_load.cpp index 05940845d172..2e59358b4e00 100644 --- a/test/cpp/jit/test_save_load.cpp +++ b/test/cpp/jit/test_save_load.cpp @@ -1,4 +1,5 @@ -#include +#include + #include #include @@ -12,10 +13,10 @@ namespace torch { namespace jit { -// Tests that an extra file written explicitly has precedence over -// extra files written by a hook -// TODO: test for the warning, too -void testExtraFilesHookPreference() { +TEST(SerializationTest, ExtraFilesHookPreference) { + // Tests that an extra file written explicitly has precedence over + // extra files written by a hook + // TODO: test for the warning, too const auto script = R"JIT( def forward(self): x = torch.rand(5, 5) @@ -43,52 +44,50 @@ void testExtraFilesHookPreference() { ASSERT_EQ(loaded_extra_files["metadata.json"], "abc"); } -void testSaveExtraFilesHook() { +TEST(SerializationTest, ExtraFileHooksNoSecret) { // no secrets + std::stringstream ss; + { + Module m("__torch__.m"); + ExtraFilesMap extra; + extra["metadata.json"] = "abc"; + m.save(ss, extra); + } + ss.seekg(0); + { + ExtraFilesMap extra; + extra["metadata.json"] = ""; + extra["secret.json"] = ""; + jit::load(ss, c10::nullopt, extra); + ASSERT_EQ(extra["metadata.json"], "abc"); + ASSERT_EQ(extra["secret.json"], ""); + } +} + +TEST(SerializationTest, ExtraFileHooksWithSecret) { + std::stringstream ss; { - std::stringstream ss; - { - Module m("__torch__.m"); - ExtraFilesMap extra; - extra["metadata.json"] = "abc"; - m.save(ss, extra); - } - ss.seekg(0); - { - ExtraFilesMap extra; - extra["metadata.json"] = ""; - extra["secret.json"] = ""; - jit::load(ss, c10::nullopt, extra); - ASSERT_EQ(extra["metadata.json"], "abc"); - ASSERT_EQ(extra["secret.json"], ""); - } + SetExportModuleExtraFilesHook([](const Module&) -> ExtraFilesMap { + return {{"secret.json", "topsecret"}}; + }); + Module m("__torch__.m"); + ExtraFilesMap extra; + extra["metadata.json"] = "abc"; + m.save(ss, extra); + SetExportModuleExtraFilesHook(nullptr); } - // some secret + ss.seekg(0); { - std::stringstream ss; - { - SetExportModuleExtraFilesHook([](const Module&) -> ExtraFilesMap { - return {{"secret.json", "topsecret"}}; - }); - Module m("__torch__.m"); - ExtraFilesMap extra; - extra["metadata.json"] = "abc"; - m.save(ss, extra); - SetExportModuleExtraFilesHook(nullptr); - } - ss.seekg(0); - { - ExtraFilesMap extra; - extra["metadata.json"] = ""; - extra["secret.json"] = ""; - jit::load(ss, c10::nullopt, extra); - 
ASSERT_EQ(extra["metadata.json"], "abc"); - ASSERT_EQ(extra["secret.json"], "topsecret"); - } + ExtraFilesMap extra; + extra["metadata.json"] = ""; + extra["secret.json"] = ""; + jit::load(ss, c10::nullopt, extra); + ASSERT_EQ(extra["metadata.json"], "abc"); + ASSERT_EQ(extra["secret.json"], "topsecret"); } } -void testTypeTags() { +TEST(SerializationTest, TypeTags) { auto list = c10::List>(); list.push_back(c10::List({1, 2, 3})); list.push_back(c10::List({4, 5, 6})); diff --git a/test/cpp/jit/test_schema_matching.cpp b/test/cpp/jit/test_schema_matching.cpp index bea7d14dcaf2..aeeb173b2678 100644 --- a/test/cpp/jit/test_schema_matching.cpp +++ b/test/cpp/jit/test_schema_matching.cpp @@ -1,8 +1,9 @@ +#include + #include +#include #include #include -#include "test/cpp/jit/test_base.h" -#include "torch/csrc/jit/runtime/custom_operator.h" #include #include @@ -10,80 +11,79 @@ namespace torch { namespace jit { -void testSchemaMatching() { - { - RegisterOperators reg({ - Operator( - "aten::test_vartype(t[] a, t b) -> (t)", - [](Stack* stack) { - c10::List list; - double a; - pop(stack, list, a); - push(stack, a); - }, - c10::AliasAnalysisKind::FROM_SCHEMA), - }); - Module m("m"); - m.define(R"( +TEST(SchemaMatchingTest, VarType) { + RegisterOperators reg({ + Operator( + "aten::test_vartype(t[] a, t b) -> (t)", + [](Stack* stack) { + c10::List list; + double a; + pop(stack, list, a); + push(stack, a); + }, + c10::AliasAnalysisKind::FROM_SCHEMA), + }); + Module m("m"); + m.define(R"( def test(self): a = (1.0, 2.0) return torch.test_vartype(a, 2.0) )"); - auto result = m.run_method("test"); - TORCH_INTERNAL_ASSERT(result.toDouble() == 2.0); + auto result = m.run_method("test"); + TORCH_INTERNAL_ASSERT(result.toDouble() == 2.0); - const std::string error_example = R"JIT( + const std::string error_example = R"JIT( def test_2(self): a = (1.0, 2.0) non_float = (1, 1) return torch.test_vartype(a, non_float) )JIT"; - std::string err = ""; - try { - m.define(error_example); - } catch (const std::exception& e) { - err = e.what(); - } - TORCH_INTERNAL_ASSERT( - err.find("previously matched to type") != std::string::npos); + std::string err = ""; + try { + m.define(error_example); + } catch (const std::exception& e) { + err = e.what(); } - { - RegisterOperators reg({ - Operator( - "aten::test_vartype2(t a, t[] b) -> (t[])", - [](Stack* stack) { - double a; - c10::List list; - pop(stack, a, list); - push(stack, a); - }, - AliasAnalysisKind::FROM_SCHEMA), - }); - Module m("m"); - m.define(R"JIT( + TORCH_INTERNAL_ASSERT( + err.find("previously matched to type") != std::string::npos); +} + +TEST(SchemaMatchingTest, VarType2) { + RegisterOperators reg({ + Operator( + "aten::test_vartype2(t a, t[] b) -> (t[])", + [](Stack* stack) { + double a; + c10::List list; + pop(stack, a, list); + push(stack, a); + }, + AliasAnalysisKind::FROM_SCHEMA), + }); + Module m("m"); + m.define(R"JIT( def test(self): a = (1.0, 2.0) return torch.test_vartype2(3.0, a) )JIT"); - auto result = m.run_method("test"); - TORCH_INTERNAL_ASSERT(result.toDouble() == 3.0); + auto result = m.run_method("test"); + TORCH_INTERNAL_ASSERT(result.toDouble() == 3.0); - static const auto error_exam2 = R"JIT( + static const auto error_exam2 = R"JIT( def test_2(self): a = (1, 2) return torch.test_vartype2(3.0, a) )JIT"; - std::string err = ""; - try { - m.define(error_exam2); - } catch (const std::exception& e) { - err = e.what(); - } - TORCH_INTERNAL_ASSERT( - err.find("previously matched to type") != std::string::npos); + std::string err = ""; + try { + 
m.define(error_exam2); + } catch (const std::exception& e) { + err = e.what(); } + TORCH_INTERNAL_ASSERT( + err.find("previously matched to type") != std::string::npos); } } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_subgraph_matcher.cpp b/test/cpp/jit/test_subgraph_matcher.cpp index 2e398db44e95..39078d345269 100644 --- a/test/cpp/jit/test_subgraph_matcher.cpp +++ b/test/cpp/jit/test_subgraph_matcher.cpp @@ -1,11 +1,12 @@ -#include "test/cpp/jit/test_base.h" +#include + #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/ir/subgraph_matcher.h" namespace torch { namespace jit { -void testTrivial1() { +TEST(SubgraphMatcherTest, Trivial1) { Graph graph, pattern; parseIR( R"IR( @@ -22,7 +23,7 @@ graph(%0): AT_ASSERT(!findPatternMatches(pattern, graph).empty()); } -void testTrivial2() { +TEST(SubgraphMatcherTest, Trivial2) { Graph graph; auto* g_in = graph.addInput(); auto* g_tanh = graph.insertNode(graph.create(aten::tanh, /*num_outputs =*/1)); @@ -45,7 +46,7 @@ void testTrivial2() { } } -void testTrivial3() { +TEST(SubgraphMatcherTest, Trivial3) { Graph graph, pattern; parseIR( R"IR( @@ -64,7 +65,7 @@ graph(%a, %b): AT_ASSERT(!findPatternMatches(pattern, graph).empty()); } -void testTrivial4() { +TEST(SubgraphMatcherTest, Trivial4) { Graph graph; auto* g_in0 = graph.addInput(); auto* g_in1 = graph.addInput(); @@ -92,7 +93,7 @@ void testTrivial4() { } } -void testLinear1() { +TEST(SubgraphMatcherTest, Linear1) { Graph graph, pattern; parseIR( R"IR( @@ -114,7 +115,7 @@ graph(%0): AT_ASSERT(!findPatternMatches(pattern, graph).empty()); } -void testLinear2() { +TEST(SubgraphMatcherTest, Linear2) { Graph graph; auto* g_in = graph.addInput(); @@ -164,7 +165,7 @@ void testLinear2() { * | * eee */ -void testDiamond1() { +TEST(SubgraphMatcherTest, Diamond1) { Graph graph, pattern1, pattern2; parseIR( R"IR( @@ -215,7 +216,7 @@ graph(%0): * | * o1 */ -void testDiamond2() { +TEST(SubgraphMatcherTest, Diamond2) { Graph graph; auto* g_in = graph.addInput(); @@ -253,7 +254,7 @@ void testDiamond2() { } } -void testXPattern() { +TEST(SubgraphMatcherTest, XPattern) { Graph graph, pattern; parseIR( R"IR( @@ -280,7 +281,7 @@ graph(%0, %1): AT_ASSERT(!findPatternMatches(pattern, graph).empty()); } -void testMultipleMatches() { +TEST(SubgraphMatcherTest, MultipleMatches) { Graph graph, pattern; parseIR( R"IR( @@ -301,7 +302,7 @@ graph(%t0): AT_ASSERT(matches.size() == 4); } -void testOverlappingMatches() { +TEST(SubgraphMatcherTest, OverlappingMatches) { Graph graph, pattern; parseIR( R"IR( @@ -323,7 +324,7 @@ graph(%t0): AT_ASSERT(matches.size() == 3); } -void testMatchInBasicBlocks1() { +TEST(SubgraphMatcherTest, MatchInBasicBlocks1) { Graph graph; parseIR( R"IR( @@ -360,7 +361,7 @@ graph(%x, %y): AT_ASSERT(findPatternMatches(pattern1, graph).size() == 0); } -void testMatchInBasicBlocks2() { +TEST(SubgraphMatcherTest, MatchInBasicBlocks2) { Graph graph; parseIR( R"IR( @@ -395,7 +396,7 @@ graph(%x, %y): AT_ASSERT(findPatternMatches(pattern1, graph).size() == 0); } -void testMatchesAttributes() { +TEST(SubgraphMatcherTest, MatchesAttributes) { Graph graph; parseIR( R"IR( @@ -479,7 +480,7 @@ graph(%a, %b): } } -void testBadPattern() { +TEST(SubgraphMatcherTest, BadPattern) { Graph graph, pattern1, pattern2; parseIR( R"IR( @@ -509,23 +510,5 @@ graph(%x): ASSERT_ANY_THROW(findPatternMatches(pattern2, graph)); } -void testSubgraphMatching() { - testTrivial1(); - testTrivial2(); - testTrivial3(); - testTrivial4(); - testLinear1(); - testLinear2(); - testDiamond1(); - testDiamond2(); 
- testXPattern(); - testMultipleMatches(); - testOverlappingMatches(); - testMatchInBasicBlocks1(); - testMatchInBasicBlocks2(); - testMatchesAttributes(); - testBadPattern(); -} - } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_subgraph_rewriter.cpp b/test/cpp/jit/test_subgraph_rewriter.cpp index 9799dfdb97b2..f166962ebc5c 100644 --- a/test/cpp/jit/test_subgraph_rewriter.cpp +++ b/test/cpp/jit/test_subgraph_rewriter.cpp @@ -1,4 +1,5 @@ -#include +#include + #include #include #include @@ -8,7 +9,7 @@ namespace torch { namespace jit { using namespace testing; -void testFilterMatch() { +TEST(SubgraphRewriterTest, FilterMatch) { auto graph = std::make_shared(); parseIR( @@ -80,7 +81,7 @@ graph(%a, %b): } } -void testFilterNoMatch() { +TEST(SubgraphRewriterTest, FilterNoMatch) { auto graph = std::make_shared(); parseIR( R"IR( @@ -121,10 +122,5 @@ graph(%a, %b): FileCheck().check("c::ccc")->check_not("d::ddd")->run(*graph); } -void testSubgraphRewriter() { - testFilterMatch(); - testFilterNoMatch(); -} - } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_subgraph_utils.cpp b/test/cpp/jit/test_subgraph_utils.cpp index e1f86cc34979..09e01f8836da 100644 --- a/test/cpp/jit/test_subgraph_utils.cpp +++ b/test/cpp/jit/test_subgraph_utils.cpp @@ -1,4 +1,5 @@ -#include "test/cpp/jit/test_base.h" +#include + #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/passes/common_subexpression_elimination.h" @@ -7,7 +8,7 @@ namespace torch { namespace jit { -void testSubgraphUtils() { +TEST(SubgraphUtilsTest, Basic) { auto graph = build_lstm(); EliminateCommonSubexpression(graph); @@ -37,7 +38,7 @@ void testSubgraphUtils() { ASSERT_EQ(originalNodes.size(), newNodes.size()); } -void testSubgraphUtilsVmap() { +TEST(SubgraphUtilsTest, Vmap) { auto graph = std::make_shared(); std::unordered_map parse_map; diff --git a/test/cpp/jit/test_utils.cpp b/test/cpp/jit/test_utils.cpp index d87e8201615d..6f626756db74 100644 --- a/test/cpp/jit/test_utils.cpp +++ b/test/cpp/jit/test_utils.cpp @@ -1,6 +1,9 @@ +#include + #include #include #include +#include namespace torch { namespace jit { @@ -137,5 +140,22 @@ std::pair lstm( return {hy, cy}; } +inline c10::AliasAnalysisKind aliasAnalysisFromSchema() { + return c10::AliasAnalysisKind::FROM_SCHEMA; +} + +namespace { +RegisterOperators reg({ + // This operator is intended to be used in JIT analysis and transformation + // pass unit tests in which Values with type Tensor are often required. It + // should not be used in situations in which the graph is actually executed + // because it always produces empty Tensors. + Operator( + "prim::MakeTestTensor() -> Tensor", + [](Stack* stack) { push(stack, at::Tensor()); }, + aliasAnalysisFromSchema()), +}); +} + } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_utils.h b/test/cpp/jit/test_utils.h index 6e6b82fff442..109f7253deea 100644 --- a/test/cpp/jit/test_utils.h +++ b/test/cpp/jit/test_utils.h @@ -1,7 +1,6 @@ #pragma once #include -#include "test/cpp/jit/test_base.h" #include "torch/csrc/jit/ir/irparser.h" #include "torch/csrc/jit/runtime/autodiff.h" #include "torch/csrc/jit/runtime/interpreter.h" diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h deleted file mode 100644 index df45054edc43..000000000000 --- a/test/cpp/jit/tests.h +++ /dev/null @@ -1,246 +0,0 @@ -#pragma once - -/** - * See README.md for instructions on how to add a new test. 
- */ -#include -#include - -namespace torch { -namespace jit { -#define TH_FORALL_TESTS(_) \ - _(ADFormulas) \ - _(Attributes) \ - _(Blocks) \ - _(CallStack) \ - _(CallStackCaching) \ - _(CodeTemplate) \ - _(ControlFlow) \ - _(CreateAutodiffSubgraphs) \ - _(CustomOperators) \ - _(CustomOperatorAliasing) \ - _(TemplatedOperatorCreator) \ - _(IValueKWargs) \ - _(CustomFusion) \ - _(SchemaMatching) \ - _(Differentiate) \ - _(DifferentiateWithRequiresGrad) \ - _(FromQualString) \ - _(InternedStrings) \ - _(PassManagement) \ - _(Proto) \ - _(RegisterFusionCachesKernel) \ - _(SchemaParser) \ - _(TopologicalIndex) \ - _(SubgraphUtils) \ - _(SubgraphUtilsVmap) \ - _(IRParser) \ - _(ConstantPooling) \ - _(CleanUpPasses) \ - _(THNNConv) \ - _(ATenNativeBatchNorm) \ - _(NoneSchemaMatch) \ - _(ClassParser) \ - _(UnifyTypes) \ - _(Profiler) \ - _(FallbackGraphs) \ - _(InsertAndEliminateRedundantGuards) \ - _(LoopPeeler) \ - _(InsertBailOuts) \ - _(PeepholeOptimize) \ - _(RecordFunction) \ - _(ThreadLocalDebugInfo) \ - _(SubgraphMatching) \ - _(SubgraphRewriter) \ - _(ModuleClone) \ - _(ModuleConstant) \ - _(ModuleParameter) \ - _(ModuleCopy) \ - _(ModuleDeepcopy) \ - _(ModuleDeepcopyString) \ - _(ModuleDeepcopyAliasing) \ - _(ModuleDefine) \ - _(QualifiedName) \ - _(ClassImport) \ - _(ScriptObject) \ - _(ExtraFilesHookPreference) \ - _(SaveExtraFilesHook) \ - _(TypeTags) \ - _(DCE) \ - _(CustomFusionNestedBlocks) \ - _(ClassDerive) \ - _(SaveLoadTorchbind) \ - _(ModuleInterfaceSerialization) \ - _(ModuleCloneWithModuleInterface) \ - _(ClassTypeAddRemoveAttr) \ - _(Inliner) \ - _(LiteInterpreterAdd) \ - _(LiteInterpreterConv) \ - _(LiteInterpreterInline) \ - _(LiteInterpreterTuple) \ - _(LiteInterpreterUpsampleNearest2d) \ - _(CommonAncestor) \ - _(AutogradSymbols) \ - _(DefaultArgTypeHinting) \ - _(Futures) \ - _(TLSFutureCallbacks) \ - _(MobileTypeParser) \ - _(LiteInterpreterBuiltinFunction) \ - _(LiteInterpreterPrim) \ - _(LiteInterpreterLoadOrigJit) \ - _(LiteInterpreterWrongMethodName) \ - _(LiteInterpreterParams) \ - _(LiteInterpreterSetState) \ - _(LiteInterpreterModuleInfoBasic) \ - _(LiteInterpreterNotSavingModuleInfo) \ - _(LiteInterpreterOneSubmoduleModuleInfo) \ - _(LiteInterpreterTwoSubmodulesModuleInfo) \ - _(LiteInterpreterSequentialModuleInfo) \ - _(LiteInterpreterHierarchyModuleInfo) \ - _(LiteInterpreterDuplicatedClassTypeModuleInfo) \ - _(LiteInterpreterEval) \ - _(TorchbindIValueAPI) \ - _(LiteInterpreterDict) \ - _(LiteInterpreterFindAndRunMethod) \ - _(LiteInterpreterFindWrongMethodName) \ - _(MobileNamedParameters) \ - _(MobileSaveLoadData) \ - _(MobileSaveLoadParameters) \ - _(MobileSaveLoadParametersEmpty) \ - _(LiteSGD) \ - _(LiteSequentialSampler) \ - _(FusionAliasing) - -#if defined(USE_CUDA) -#define TH_FORALL_TESTS_CUDA(_) \ - _(ArgumentSpec) \ - _(CompleteArgumentSpec) \ - _(Fusion) \ - _(GraphExecutor) \ - _(ModuleConversion) \ - _(Interp) \ - _(TypeCheck) \ - _(GPU_IrGraphGenerator) \ - _(GPU_FusionDispatch) \ - _(GPU_FusionClear) \ - _(GPU_FusionCopy) \ - _(GPU_FusionMove) \ - _(GPU_FusionSimpleArith) \ - _(GPU_FusionExprEvalConstants) \ - _(GPU_FusionExprEvalBindings) \ - _(GPU_FusionExprEvalBasic) \ - _(GPU_FusionExprEvalComplex) \ - _(GPU_FusionExprEvalPostLower) \ - _(GPU_FusionSimpleTypePromote) \ - _(GPU_FusionMutator) \ - _(GPU_FusionRegister) \ - _(GPU_FusionTopoSort) \ - _(GPU_FusionTensor) \ - _(GPU_FusionFilterVals) \ - _(GPU_FusionTVSplit) \ - _(GPU_FusionTVMerge) \ - _(GPU_FusionTVReorder) \ - _(GPU_FusionEquality) \ - _(GPU_FusionParser) \ - 
_(GPU_FusionDependency) \ - _(GPU_FusionCodeGen) \ - _(GPU_FusionCodeGen2) \ - _(GPU_FusionSimplePWise) \ - _(GPU_FusionExecKernel) \ - _(GPU_FusionForLoop) \ - _(GPU_FusionLoopUnroll) \ - _(GPU_FusionUnaryOps) \ - _(GPU_FusionBinaryOps) \ - _(GPU_FusionTernaryOps) \ - _(GPU_FusionCompoundOps) \ - _(GPU_FusionCastOps) \ - _(GPU_FusionAdvancedComputeAt) \ - _(GPU_FusionScalarInputs) \ - _(GPU_FusionRFactorReplay) \ - _(GPU_FusionReduction) \ - _(GPU_FusionReduction2) \ - _(GPU_FusionReduction3) \ - _(GPU_FusionReduction4) \ - _(GPU_FusionReduction5) \ - _(GPU_FusionReductionTFT) \ - _(GPU_FusionSimpleBCast) \ - _(GPU_FusionComplexBCast) \ - _(GPU_FusionAdvancedIndexing) \ - _(GPU_FusionSimpleGemm) \ - _(GPU_FusionSoftmax1D) \ - _(GPU_FusionSoftmax1DNormalized) \ - _(GPU_FusionSoftmax3D) \ - _(GPU_FusionSoftmax3DNormalized) \ - _(GPU_FusionSoftmaxComputeAt) \ - _(GPU_FusionGridReduction1) \ - _(GPU_FusionGridReduction2) \ - _(GPU_FusionGridReduction3dim1) \ - _(GPU_FusionGridReduction3dim0) \ - _(GPU_FusionGridReduction4) \ - _(GPU_FusionGridReduction5) \ - _(GPU_FusionGridReduction6) \ - _(GPU_FusionNonRedAxisBind) \ - _(GPU_FusionBCastInnerDim) \ - _(GPU_FusionBCastReduce) \ - _(GPU_FusionSplitBCast) \ - _(GPU_FusionComputeAtExprOrder) \ - _(GPU_FusionZeroDimComputeAt) \ - _(GPU_FusionZeroDimBroadcast) \ - _(GPU_FusionZeroDimReduction) \ - _(GPU_FusionReductionMultiConsumer) \ - _(GPU_FusionBCastAfterReduce) \ - _(GPU_FusionReductionScheduler) \ - _(GPU_FusionReductionSchedulerMultiDimNonFastest) \ - _(GPU_FusionReductionSchedulerMultiDimFastest) \ - _(GPU_FusionReductionSchedulerDimShmoo) \ - _(GPU_FusionCacheBefore) \ - _(GPU_FusionCacheAfter) \ - _(GPU_FusionCacheIndirect) \ - _(GPU_FusionCacheBcast) \ - _(GPU_FusionCacheComplex) \ - _(GPU_FusionCacheMultiConsumer) \ - _(GPU_FusionSmem) \ - _(GPU_FusionSmemReduce) \ - _(GPU_FusionSmemBlockGemm) \ - _(GPU_FusionSmemBlockGemmCache) \ - _(GPU_FusionConstCheck) \ - _(GPU_FusionSymbolicReduction) \ - _(GPU_FusionUnrollWithAlloc) \ - _(GPU_FusionIsZeroInt) \ - _(GPU_FusionIsOneInt) \ - _(GPU_FusionComputeAtNonterminatingOutput) \ - _(GPU_FusionTraversalOrder1) \ - _(GPU_FusionTraversalOrder2) \ - _(GPU_FusionTraversalOrder3) \ - _(GPU_FusionTraversalOrder4) \ - _(GPU_FusionTraversalOrder5) \ - _(GPU_FusionTraversalOrder6) \ - _(GPU_FusionTraversalOrder7) \ - _(GPU_FusionBranches) \ - _(GPU_FusionThreadPredicate) -#else -#define TH_FORALL_TESTS_CUDA(_) \ - _(ArgumentSpec) \ - _(CompleteArgumentSpec) \ - _(Fusion) \ - _(GraphExecutor) \ - _(ModuleConversion) \ - _(Interp) \ - _(TypeCheck) -#endif - -#define DECLARE_JIT_TEST(name) void test##name(); -TH_FORALL_TESTS(DECLARE_JIT_TEST) -TH_FORALL_TESTS_CUDA(DECLARE_JIT_TEST) -#undef DECLARE_JIT_TEST - -// This test is special since it requires prior setup in python. 
-// So it is not part of the general test list (which is shared between the gtest -// and python test runners), but is instead invoked manually by the -// torch_python_test.cpp -void testEvalModeForLoadedModule(); -void testSerializationInterop(); -void testTorchSaveError(); - -} // namespace jit -} // namespace torch diff --git a/test/cpp/rpc/test_e2e_process_group.cpp b/test/cpp/rpc/test_e2e_process_group.cpp index d509a4606fa1..7c5af57d6a09 100644 --- a/test/cpp/rpc/test_e2e_process_group.cpp +++ b/test/cpp/rpc/test_e2e_process_group.cpp @@ -19,6 +19,7 @@ class TestE2EProcessGroup : public TestE2EBase { options.devices.push_back( ::c10d::ProcessGroupGloo::createDeviceForHostname(serverAddress)); std::chrono::milliseconds rpcTimeout(30000); + options.timeout = rpcTimeout; // Initialize server rpc agent. auto pg = diff --git a/test/cpp/tensorexpr/CMakeLists.txt b/test/cpp/tensorexpr/CMakeLists.txt index a2922045adff..af4299e395cd 100644 --- a/test/cpp/tensorexpr/CMakeLists.txt +++ b/test/cpp/tensorexpr/CMakeLists.txt @@ -12,29 +12,45 @@ add_executable(test_tensorexpr target_link_libraries(test_tensorexpr PRIVATE torch gtest) target_include_directories(test_tensorexpr PRIVATE ${ATen_CPU_INCLUDE}) +add_executable(tutorial_tensorexpr ${TENSOREXPR_TEST_ROOT}/tutorial.cpp) +target_link_libraries(tutorial_tensorexpr PRIVATE torch) +target_include_directories(tutorial_tensorexpr PRIVATE ${ATen_CPU_INCLUDE}) + + if(USE_CUDA) target_link_libraries(test_tensorexpr PRIVATE ${CUDA_LIBRARIES} ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB} ${TORCH_CUDA_LIBRARIES}) - target_compile_definitions(test_tensorexpr PRIVATE USE_CUDA) + + target_link_libraries(tutorial_tensorexpr PRIVATE + ${CUDA_LIBRARIES} + ${CUDA_NVRTC_LIB} + ${CUDA_CUDA_LIB} + ${TORCH_CUDA_LIBRARIES}) + target_compile_definitions(tutorial_tensorexpr PRIVATE USE_CUDA) elseif(USE_ROCM) target_link_libraries(test_tensorexpr PRIVATE ${ROCM_HIPRTC_LIB} ${PYTORCH_HIP_HCC_LIBRARIES} ${TORCH_CUDA_LIBRARIES}) - - target_link_libraries(test_tensorexpr PRIVATE caffe2_gpu) - target_compile_definitions(test_tensorexpr PRIVATE USE_ROCM) + + target_link_libraries(tutorial_tensorexpr PRIVATE + ${ROCM_HIPRTC_LIB} + ${PYTORCH_HIP_HCC_LIBRARIES} + ${TORCH_CUDA_LIBRARIES}) + target_compile_definitions(tutorial_tensorexpr PRIVATE USE_ROCM) endif() if(INSTALL_TEST) install(TARGETS test_tensorexpr DESTINATION bin) + install(TARGETS tutorial_tensorexpr DESTINATION bin) # Install PDB files for MSVC builds if(MSVC AND BUILD_SHARED_LIBS) install(FILES $ DESTINATION bin OPTIONAL) + install(FILES $ DESTINATION bin OPTIONAL) endif() endif() diff --git a/test/cpp/tensorexpr/test_aten.cpp b/test/cpp/tensorexpr/test_aten.cpp index 3ccc484c8420..ca642d1db96e 100644 --- a/test/cpp/tensorexpr/test_aten.cpp +++ b/test/cpp/tensorexpr/test_aten.cpp @@ -15,13 +15,13 @@ using namespace torch::jit::tensorexpr; void testATen_cast_Float() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); + ExprHandle load_a = a_buf.load(index); ExprHandle to_float = Cast::make(kFloat, load_a); - Stmt* store_b = Store::make(b_buf, {index}, to_float, 1); + Stmt* store_b = b_buf.store({index}, to_float); Stmt* stmt = For::make(index, 0, kTotalSize, 
store_b); PaddedBuffer a_v(kTotalSize); @@ -43,13 +43,13 @@ void testATen_cast_Float() { void testATennegInt() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); + ExprHandle load_a = a_buf.load(index); ExprHandle to_float = Sub::make(0, load_a); - Stmt* store_b = Store::make(b_buf, {index}, to_float, 1); + Stmt* store_b = b_buf.store({index}, to_float); Stmt* stmt = For::make(index, 0, kTotalSize, store_b); PaddedBuffer a_v(kTotalSize); @@ -71,13 +71,13 @@ void testATennegInt() { void testATennegFloat() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); + ExprHandle load_a = a_buf.load(index); ExprHandle to_float = Sub::make(0, load_a); - Stmt* store_b = Store::make(b_buf, {index}, to_float, 1); + Stmt* store_b = b_buf.store({index}, to_float); Stmt* stmt = For::make(index, 0, kTotalSize, store_b); PaddedBuffer a_v(kTotalSize); @@ -99,16 +99,16 @@ void testATennegFloat() { void testATenaddInt() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kInt)); - Buffer d_buf(BufHandle("D", {ExprHandle(kTotalSize)}, kInt)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kInt)); + Placeholder d_buf(BufHandle("D", {ExprHandle(kTotalSize)}, kInt)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - ExprHandle load_c = Load::make(c_buf, {index}, 1); - Stmt* store_d = Store::make(d_buf, {index}, load_a + load_b * load_c, 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + ExprHandle load_c = c_buf.load(index); + Stmt* store_d = d_buf.store({index}, load_a + load_b * load_c); Stmt* stmt = For::make(index, 0, kTotalSize, store_d); PaddedBuffer a_v(kTotalSize); @@ -136,16 +136,16 @@ void testATenaddInt() { void testATenaddFloat() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); - Buffer d_buf(BufHandle("D", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder d_buf(BufHandle("D", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, 
{index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - ExprHandle load_c = Load::make(c_buf, {index}, 1); - Stmt* store_d = Store::make(d_buf, {index}, load_a + load_b * load_c, 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + ExprHandle load_c = c_buf.load(index); + Stmt* store_d = d_buf.store({index}, load_a + load_b * load_c); Stmt* stmt = For::make(index, 0, kTotalSize, store_d); PaddedBuffer a_v(kTotalSize); @@ -173,16 +173,16 @@ void testATenaddFloat() { void testATensubInt() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kInt)); - Buffer d_buf(BufHandle("D", {ExprHandle(kTotalSize)}, kInt)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kInt)); + Placeholder d_buf(BufHandle("D", {ExprHandle(kTotalSize)}, kInt)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - ExprHandle load_c = Load::make(c_buf, {index}, 1); - Stmt* store_d = Store::make(d_buf, {index}, load_a - load_b * load_c, 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + ExprHandle load_c = c_buf.load(index); + Stmt* store_d = d_buf.store({index}, load_a - load_b * load_c); Stmt* stmt = For::make(index, 0, kTotalSize, store_d); PaddedBuffer a_v(kTotalSize); @@ -210,16 +210,16 @@ void testATensubInt() { void testATensubFloat() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); - Buffer d_buf(BufHandle("D", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder d_buf(BufHandle("D", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - ExprHandle load_c = Load::make(c_buf, {index}, 1); - Stmt* store_d = Store::make(d_buf, {index}, load_a - load_b * load_c, 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + ExprHandle load_c = c_buf.load(index); + Stmt* store_d = d_buf.store({index}, load_a - load_b * load_c); Stmt* stmt = For::make(index, 0, kTotalSize, store_d); PaddedBuffer a_v(kTotalSize); @@ -247,17 +247,16 @@ void testATensubFloat() { void testATenlerp() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); - Buffer d_buf(BufHandle("D", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder d_buf(BufHandle("D", 
{ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - ExprHandle load_c = Load::make(c_buf, {index}, 1); - Stmt* store_d = - Store::make(d_buf, {index}, load_a + load_c * (load_b - load_a), 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + ExprHandle load_c = c_buf.load(index); + Stmt* store_d = d_buf.store({index}, load_a + load_c * (load_b - load_a)); Stmt* stmt = For::make(index, 0, kTotalSize, store_d); PaddedBuffer a_v(kTotalSize); @@ -285,19 +284,18 @@ void testATenlerp() { void testATenaddcmulInt() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kInt)); - Buffer d_buf(BufHandle("D", {ExprHandle(kTotalSize)}, kInt)); - Buffer e_buf(BufHandle("E", {ExprHandle(kTotalSize)}, kInt)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kInt)); + Placeholder d_buf(BufHandle("D", {ExprHandle(kTotalSize)}, kInt)); + Placeholder e_buf(BufHandle("E", {ExprHandle(kTotalSize)}, kInt)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - ExprHandle load_c = Load::make(c_buf, {index}, 1); - ExprHandle load_d = Load::make(d_buf, {index}, 1); - Stmt* store_e = - Store::make(e_buf, {index}, load_a + load_b * load_c * load_d, 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + ExprHandle load_c = c_buf.load(index); + ExprHandle load_d = d_buf.load(index); + Stmt* store_e = e_buf.store({index}, load_a + load_b * load_c * load_d); Stmt* stmt = For::make(index, 0, kTotalSize, store_e); PaddedBuffer a_v(kTotalSize); @@ -328,19 +326,18 @@ void testATenaddcmulInt() { void testATenaddcmulFloat() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); - Buffer d_buf(BufHandle("D", {ExprHandle(kTotalSize)}, kFloat)); - Buffer e_buf(BufHandle("E", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder d_buf(BufHandle("D", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder e_buf(BufHandle("E", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - ExprHandle load_c = Load::make(c_buf, {index}, 1); - ExprHandle load_d = Load::make(d_buf, {index}, 1); - Stmt* store_e = - Store::make(e_buf, {index}, load_a + load_b * load_c * load_d, 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + ExprHandle load_c = c_buf.load(index); + ExprHandle load_d = d_buf.load(index); + Stmt* store_e = e_buf.store({index}, load_a + load_b * load_c * load_d); Stmt* stmt = For::make(index, 0, kTotalSize, store_e); PaddedBuffer 
a_v(kTotalSize); @@ -371,14 +368,14 @@ void testATenaddcmulFloat() { void testATenmulInt() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kInt)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kInt)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - Stmt* store_c = Store::make(c_buf, {index}, load_a * load_b, 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + Stmt* store_c = c_buf.store({index}, load_a * load_b); Stmt* stmt = For::make(index, 0, kTotalSize, store_c); PaddedBuffer a_v(kTotalSize); @@ -403,14 +400,14 @@ void testATenmulInt() { void testATenmulFloat() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - Stmt* store_c = Store::make(c_buf, {index}, load_a * load_b, 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + Stmt* store_c = c_buf.store({index}, load_a * load_b); Stmt* stmt = For::make(index, 0, kTotalSize, store_c); PaddedBuffer a_v(kTotalSize); @@ -435,14 +432,14 @@ void testATenmulFloat() { void testATendivInt() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kInt)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kInt)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - Stmt* store_c = Store::make(c_buf, {index}, load_a / load_b, 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + Stmt* store_c = c_buf.store({index}, load_a / load_b); Stmt* stmt = For::make(index, 0, kTotalSize, store_c); PaddedBuffer a_v(kTotalSize); @@ -467,14 +464,14 @@ void testATendivInt() { void testATendivFloat() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, 
{index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - Stmt* store_c = Store::make(c_buf, {index}, load_a / load_b, 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + Stmt* store_c = c_buf.store({index}, load_a / load_b); Stmt* stmt = For::make(index, 0, kTotalSize, store_c); PaddedBuffer a_v(kTotalSize); @@ -499,15 +496,14 @@ void testATendivFloat() { void testATenmaxInt() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kInt)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kInt)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - Stmt* store_c = - Store::make(c_buf, {index}, Max::make(load_a, load_b, true), 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + Stmt* store_c = c_buf.store({index}, Max::make(load_a, load_b, true)); Stmt* stmt = For::make(index, 0, kTotalSize, store_c); PaddedBuffer a_v(kTotalSize); @@ -532,15 +528,14 @@ void testATenmaxInt() { void testATenmaxFloat() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - Stmt* store_c = - Store::make(c_buf, {index}, Max::make(load_a, load_b, true), 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + Stmt* store_c = c_buf.store({index}, Max::make(load_a, load_b, true)); Stmt* stmt = For::make(index, 0, kTotalSize, store_c); PaddedBuffer a_v(kTotalSize); @@ -565,15 +560,14 @@ void testATenmaxFloat() { void testATenminInt() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kInt)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kInt)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - Stmt* store_c = - Store::make(c_buf, {index}, Min::make(load_a, load_b, true), 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + Stmt* store_c = c_buf.store({index}, Min::make(load_a, load_b, true)); Stmt* stmt = For::make(index, 0, kTotalSize, store_c); PaddedBuffer a_v(kTotalSize); @@ -598,15 +592,14 @@ void testATenminInt() { void testATenminFloat() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, 
kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - Stmt* store_c = - Store::make(c_buf, {index}, Min::make(load_a, load_b, true), 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + Stmt* store_c = c_buf.store({index}, Min::make(load_a, load_b, true)); Stmt* stmt = For::make(index, 0, kTotalSize, store_c); PaddedBuffer a_v(kTotalSize); @@ -631,12 +624,12 @@ void testATenminFloat() { void __ubsan_ignore_float_divide_by_zero__ testATenreciprocal() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - Stmt* store_b = Store::make(b_buf, {index}, FloatImm::make(1.0f) / load_a, 1); + ExprHandle load_a = a_buf.load(index); + Stmt* store_b = b_buf.store({index}, FloatImm::make(1.0f) / load_a); Stmt* stmt = For::make(index, 0, kTotalSize, store_b); PaddedBuffer a_v(kTotalSize); @@ -658,12 +651,12 @@ void __ubsan_ignore_float_divide_by_zero__ testATenreciprocal() { void testATenreluInt() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - Stmt* store_b = Store::make(b_buf, {index}, Max::make(load_a, 0, false), 1); + ExprHandle load_a = a_buf.load(index); + Stmt* store_b = b_buf.store({index}, Max::make(load_a, 0, false)); Stmt* stmt = For::make(index, 0, kTotalSize, store_b); PaddedBuffer a_v(kTotalSize); @@ -685,16 +678,14 @@ void testATenreluInt() { void testATenreluFloat() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - Stmt* store_b = Store::make( - b_buf, - {index}, - Max::make(load_a, 0, false), // relu does not propagate nans - 1); + ExprHandle load_a = a_buf.load(index); + Stmt* store_b = b_buf.store( + {index}, Max::make(load_a, 0, false) // relu does not propagate nans + ); Stmt* stmt = For::make(index, 0, kTotalSize, store_b); PaddedBuffer a_v(kTotalSize); @@ -716,12 +707,12 @@ void testATenreluFloat() { void testATenlogFloat() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, 
kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - Stmt* store_b = Store::make(b_buf, {index}, log(load_a), 1); + ExprHandle load_a = a_buf.load(index); + Stmt* store_b = b_buf.store({index}, log(load_a)); Stmt* stmt = For::make(index, 0, kTotalSize, store_b); PaddedBuffer a_v(kTotalSize); @@ -743,12 +734,12 @@ void testATenlogFloat() { void testATenlog10Float() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - Stmt* store_b = Store::make(b_buf, {index}, log10(load_a), 1); + ExprHandle load_a = a_buf.load(index); + Stmt* store_b = b_buf.store({index}, log10(load_a)); Stmt* stmt = For::make(index, 0, kTotalSize, store_b); PaddedBuffer a_v(kTotalSize); @@ -770,12 +761,12 @@ void testATenlog10Float() { void testATenlog2Float() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - Stmt* store_b = Store::make(b_buf, {index}, log2(load_a), 1); + ExprHandle load_a = a_buf.load(index); + Stmt* store_b = b_buf.store({index}, log2(load_a)); Stmt* stmt = For::make(index, 0, kTotalSize, store_b); PaddedBuffer a_v(kTotalSize); @@ -797,12 +788,12 @@ void testATenlog2Float() { void testATenexpFloat() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - Stmt* store_b = Store::make(b_buf, {index}, exp(load_a), 1); + ExprHandle load_a = a_buf.load(index); + Stmt* store_b = b_buf.store({index}, exp(load_a)); Stmt* stmt = For::make(index, 0, kTotalSize, store_b); PaddedBuffer a_v(kTotalSize); @@ -824,12 +815,12 @@ void testATenexpFloat() { void testATenerfFloat() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - Stmt* store_b = Store::make(b_buf, {index}, erf(load_a), 1); + ExprHandle load_a = a_buf.load(index); + Stmt* store_b = b_buf.store({index}, erf(load_a)); Stmt* stmt = For::make(index, 0, kTotalSize, store_b); PaddedBuffer a_v(kTotalSize); @@ -851,12 +842,12 @@ void testATenerfFloat() { void testATencosFloat() { KernelScope 
kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - Stmt* store_b = Store::make(b_buf, {index}, cos(load_a), 1); + ExprHandle load_a = a_buf.load(index); + Stmt* store_b = b_buf.store({index}, cos(load_a)); Stmt* stmt = For::make(index, 0, kTotalSize, store_b); PaddedBuffer a_v(kTotalSize); @@ -878,27 +869,22 @@ void testATencosFloat() { void testATeneqInt() { KernelScope kernel_scope; constexpr int N = 128; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 1); std::vector b_buffer(N, 1); std::vector c_buffer(N, 0); - auto mask = IntImm::make(1); VarHandle i("i", kInt); auto memcpy_expr = For::make( i, 0, N, - Store::make( - c, + c.store( {i}, CompareSelect::make( - Load::make(a, {i}, mask), - Load::make(b, {i}, mask), - CompareSelectOperation::kEQ), - mask)); + a.load(i), b.load(i), CompareSelectOperation::kEQ))); SimpleIREvaluator ir_eval(memcpy_expr, a, b, c); ir_eval(a_buffer, b_buffer, c_buffer); @@ -909,27 +895,22 @@ void testATeneqInt() { void testATengeInt() { KernelScope kernel_scope; constexpr int N = 128; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 5); std::vector b_buffer(N, 5); std::vector c_buffer(N, 0); - auto mask = IntImm::make(1); VarHandle i("i", kInt); auto memcpy_expr = For::make( i, 0, N, - Store::make( - c, + c.store( {i}, CompareSelect::make( - Load::make(a, {i}, mask), - Load::make(b, {i}, mask), - CompareSelectOperation::kGE), - mask)); + a.load(i), b.load(i), CompareSelectOperation::kGE))); SimpleIREvaluator ir_eval(memcpy_expr, a, b, c); ir_eval(a_buffer, b_buffer, c_buffer); @@ -940,27 +921,22 @@ void testATengeInt() { void testATengtInt() { KernelScope kernel_scope; constexpr int N = 128; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 6); std::vector b_buffer(N, 3); std::vector c_buffer(N, 0); - auto mask = IntImm::make(1); VarHandle i("i", kInt); auto memcpy_expr = For::make( i, 0, N, - Store::make( - c, + c.store( {i}, CompareSelect::make( - Load::make(a, {i}, mask), - Load::make(b, {i}, mask), - CompareSelectOperation::kGT), - mask)); + a.load(i), b.load(i), CompareSelectOperation::kGT))); SimpleIREvaluator ir_eval(memcpy_expr, a, b, c); ir_eval(a_buffer, b_buffer, c_buffer); @@ -971,27 +947,22 @@ void testATengtInt() { void testATenleInt() { KernelScope kernel_scope; constexpr int N = 128; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); + Placeholder c(BufHandle("C", {N}, kInt)); 
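Illustrative sketch (not part of the patch): the hunks in this file swap the old Buffer plus Load::make/Store::make calls, which carried an explicit IntImm mask, for Placeholder::load/store. A minimal standalone version of the comparison kernels above, built only from calls that appear in this diff (the helper name, sizes, and the tensor.h include are assumptions), could look like:

#include <vector>
#include "torch/csrc/jit/tensorexpr/eval.h"
#include "torch/csrc/jit/tensorexpr/ir.h"
#include "torch/csrc/jit/tensorexpr/tensor.h" // assumed home of Placeholder

using namespace torch::jit::tensorexpr;

// c[i] = (a[i] == b[i]) ? 1 : 0, written against the new Placeholder API:
// no explicit mask argument, loads and stores go through the Placeholder.
void examplePlaceholderCompareSelect() {
  KernelScope kernel_scope;
  constexpr int N = 128;
  Placeholder a(BufHandle("A", {N}, kInt));
  Placeholder b(BufHandle("B", {N}, kInt));
  Placeholder c(BufHandle("C", {N}, kInt));

  VarHandle i("i", kInt);
  Stmt* body = c.store(
      {i},
      CompareSelect::make(a.load(i), b.load(i), CompareSelectOperation::kEQ));
  Stmt* loop = For::make(i, 0, N, body);

  std::vector<int> a_buffer(N, 5), b_buffer(N, 5), c_buffer(N, 0);
  SimpleIREvaluator ir_eval(loop, a, b, c);
  ir_eval(a_buffer, b_buffer, c_buffer);
  // All inputs compare equal, so c_buffer is expected to hold 1 in every slot.
}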
std::vector a_buffer(N, 5); std::vector b_buffer(N, 5); std::vector c_buffer(N, 0); - auto mask = IntImm::make(1); VarHandle i("i", kInt); auto memcpy_expr = For::make( i, 0, N, - Store::make( - c, + c.store( {i}, CompareSelect::make( - Load::make(a, {i}, mask), - Load::make(b, {i}, mask), - CompareSelectOperation::kLE), - mask)); + a.load(i), b.load(i), CompareSelectOperation::kLE))); SimpleIREvaluator ir_eval(memcpy_expr, a, b, c); ir_eval(a_buffer, b_buffer, c_buffer); @@ -1002,27 +973,22 @@ void testATenleInt() { void testATenltInt() { KernelScope kernel_scope; constexpr int N = 128; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 5); std::vector b_buffer(N, 5); std::vector c_buffer(N, 1); - auto mask = IntImm::make(1); VarHandle i("i", kInt); auto memcpy_expr = For::make( i, 0, N, - Store::make( - c, + c.store( {i}, CompareSelect::make( - Load::make(a, {i}, mask), - Load::make(b, {i}, mask), - CompareSelectOperation::kLT), - mask)); + a.load(i), b.load(i), CompareSelectOperation::kLT))); SimpleIREvaluator ir_eval(memcpy_expr, a, b, c); ir_eval(a_buffer, b_buffer, c_buffer); diff --git a/test/cpp/tensorexpr/test_boundsinference.cpp b/test/cpp/tensorexpr/test_boundsinference.cpp index 98d3d4127da8..11c1c34f24a1 100644 --- a/test/cpp/tensorexpr/test_boundsinference.cpp +++ b/test/cpp/tensorexpr/test_boundsinference.cpp @@ -7,9 +7,7 @@ #include #include #include -#include #include -#include #include #include #include @@ -49,9 +47,9 @@ void testBoundsInference_1() { // {{b, kStore, 0, 99}, {a, kLoad, 0, 99}} KernelScope kernel_scope; ExprHandle n(100); - Buffer a(BufHandle("a", {n}, kFloat)); + Placeholder a(BufHandle("a", {n}, kFloat)); Tensor* b = - Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a(i); }); + Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); }); LoopNest l({b}); auto bounds_info = inferBounds(l.root_stmt()); @@ -74,9 +72,9 @@ void testBoundsInference_2() { // {{b, kStore, 0, n-1}, {a, kLoad, 0, n-1}} KernelScope kernel_scope; VarHandle n("n", kInt); - Buffer a(BufHandle("a", {n}, kFloat)); + Placeholder a(BufHandle("a", {n}, kFloat)); Tensor* b = - Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a(i); }); + Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); }); LoopNest l({b}); auto bounds_info = inferBounds(l.root_stmt()); @@ -99,9 +97,10 @@ void testBoundsInference_3() { // {{b, kStore, 0, 99}, {a, kLoad, 0, 109}} KernelScope kernel_scope; ExprHandle n(100); - Buffer a(BufHandle("a", {n + 10}, kFloat)); - Tensor* b = Compute( - "b", {{n, "i"}}, [&](const VarHandle& i) { return a(i) * a(i + 10); }); + Placeholder a(BufHandle("a", {n + 10}, kFloat)); + Tensor* b = Compute("b", {{n, "i"}}, [&](const VarHandle& i) { + return a.load(i) * a.load(i + 10); + }); LoopNest l({b}); auto bounds_info = inferBounds(l.root_stmt()); @@ -128,14 +127,14 @@ void testBoundsInference_4() { KernelScope kernel_scope; ExprHandle W(320); ExprHandle H(200); - Buffer a(BufHandle("a", {H, W}, kFloat)); + Placeholder a(BufHandle("a", {H, W}, kFloat)); Tensor* b = Compute( "b", {{H, "y"}, {W, "x"}}, [&](const VarHandle& y, const VarHandle& x) { return x * y; }); Tensor* c = Compute( "c", {{H, "y"}, {W, "x"}}, [&](const VarHandle& y, const VarHandle& x) { - return a(y, x) * b->call(y, x); + return a.load(y, x) * b->call(y, 
x); }); LoopNest l({c}); std::vector loops = l.getLoopStmtsFor(c); @@ -207,9 +206,9 @@ void testBoundsInference_5() { // b[i_tail + (100/16)*16] = a[i_tail + (100/16)*16]; KernelScope kernel_scope; ExprHandle n(100); - Buffer a(BufHandle("a", {n}, kFloat)); + Placeholder a(BufHandle("a", {n}, kFloat)); Tensor* b = - Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a(i); }); + Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); }); LoopNest l({b}); For* outer; @@ -260,14 +259,14 @@ void testBoundsInference_6() { ExprHandle H(200); ExprHandle CW(32); ExprHandle CH(20); - Buffer a(BufHandle("a", {H, W}, kFloat)); + Placeholder a(BufHandle("a", {H, W}, kFloat)); Tensor* b = Compute( "b", {{H, "y"}, {W, "x"}}, [&](const VarHandle& y, const VarHandle& x) { return x * y; }); Tensor* c = Compute( "c", {{CH, "y"}, {CW, "x"}}, [&](const VarHandle& y, const VarHandle& x) { - return a(y + 100, x + 100) * b->call(y * 2, x * 5); + return a.load(y + 100, x + 100) * b->call(y * 2, x * 5); }); LoopNest l({c}); std::vector loops = l.getLoopStmtsFor(c); @@ -328,11 +327,11 @@ void testBoundsInference_6() { void testBoundsInferenceNonOverlapping() { KernelScope kernel_scope; ExprHandle H(3); - Buffer a(BufHandle("a", {10}, kFloat)); + Placeholder a(BufHandle("a", {10}, kFloat)); Tensor* b = - Compute("b", {{H, "x"}}, [&](const VarHandle& x) { return a(x); }); + Compute("b", {{H, "x"}}, [&](const VarHandle& x) { return a.load(x); }); Tensor* c = Compute( - "c", {{H, "x"}}, [&](const VarHandle& x) { return a(x + H + 1); }); + "c", {{H, "x"}}, [&](const VarHandle& x) { return a.load(x + H + 1); }); LoopNest l({b, c}); std::vector loops = NodeFinder::find(l.root_stmt()); @@ -389,11 +388,11 @@ void testBoundsInferenceNonOverlapping() { void testBoundsInferenceAdjacent() { KernelScope kernel_scope; ExprHandle H(6); - Buffer a(BufHandle("a", {20}, kFloat)); + Placeholder a(BufHandle("a", {20}, kFloat)); Tensor* b = - Compute("b", {{H, "x"}}, [&](const VarHandle& x) { return a(x); }); - Tensor* c = - Compute("c", {{H, "x"}}, [&](const VarHandle& x) { return a(x + H); }); + Compute("b", {{H, "x"}}, [&](const VarHandle& x) { return a.load(x); }); + Tensor* c = Compute( + "c", {{H, "x"}}, [&](const VarHandle& x) { return a.load(x + H); }); LoopNest l({b, c}); std::vector loops = NodeFinder::find(l.root_stmt()); @@ -448,7 +447,7 @@ void testBoundsInferenceAdjacent() { void testMergeInferredBounds() { KernelScope kernel_scope; - Buffer a(BufHandle("a", {10}, kFloat)); + Placeholder a(BufHandle("a", {10}, kFloat)); // There are seven cases to consider in mergeTensorAccesses(A, B) // * A is lower than B and does not overlap. @@ -518,7 +517,7 @@ void testMergeInferredBounds() { void testMergeInferredLoadStoreDiff() { KernelScope kernel_scope; - Buffer a(BufHandle("a", {10}, kFloat)); + Placeholder a(BufHandle("a", {10}, kFloat)); // Loads and Stores do not merge: BoundsInfo info; @@ -549,7 +548,7 @@ void testMergeInferredLoadStoreDiff() { void testMergeInferred2DBounds() { KernelScope kernel_scope; - Buffer a(BufHandle("a", {10, 10}, kFloat)); + Placeholder a(BufHandle("a", {10, 10}, kFloat)); // Non overlapping in both dimensions: BoundsInfo info; @@ -607,7 +606,7 @@ void testMergeInferred2DBounds() { void testMergeAdjacentBounds() { KernelScope kernel_scope; - Buffer a(BufHandle("a", {10}, kFloat)); + Placeholder a(BufHandle("a", {10}, kFloat)); // Adjacent but not overlapping bounds can be merged. // e.g. 
{1-4} | {5-9} => {1-9} @@ -647,7 +646,7 @@ std::pair boundAsStringPair( void testMergeSymbolicBounds() { KernelScope kernel_scope; - Buffer a(BufHandle("a", {10}, kFloat)); + Placeholder a(BufHandle("a", {10}, kFloat)); VarHandle W("W", kInt); VarHandle X("X", kInt); VarHandle Y("Y", kInt); @@ -757,7 +756,7 @@ void testMergeSymbolicBounds() { void testMergeSymbolicAdjacent() { KernelScope kernel_scope; - Buffer a(BufHandle("a", {10}, kFloat)); + Placeholder a(BufHandle("a", {10}, kFloat)); VarHandle X("X", kInt); VarHandle Y("Y", kInt); diff --git a/test/cpp/tensorexpr/test_cuda.cpp b/test/cpp/tensorexpr/test_cuda.cpp index 2ad70e158ebf..6dba8c574c57 100644 --- a/test/cpp/tensorexpr/test_cuda.cpp +++ b/test/cpp/tensorexpr/test_cuda.cpp @@ -8,7 +8,6 @@ #include #include "test/cpp/tensorexpr/padded_buffer.h" -#include "torch/csrc/jit/tensorexpr/buffer.h" #include "torch/csrc/jit/tensorexpr/cuda_codegen.h" #include "torch/csrc/jit/tensorexpr/ir_simplifier.h" #include "torch/csrc/jit/tensorexpr/loopnest.h" @@ -31,8 +30,8 @@ void testCudaTestVectorAdd01_impl() { const int block_count = 16; const int block_size = 128; Dtype dtype = ToDtype(); - Buffer a_buf("a", dtype, {num_iter, block_count, block_size}); - Buffer b_buf("b", dtype, {num_iter, block_count, block_size}); + Placeholder a_buf("a", dtype, {num_iter, block_count, block_size}); + Placeholder b_buf("b", dtype, {num_iter, block_count, block_size}); Tensor* c = Compute( "c", { @@ -41,7 +40,7 @@ void testCudaTestVectorAdd01_impl() { {block_size, "t_id"}, }, [&](const VarHandle& n, const VarHandle& b_id, const VarHandle& t_id) { - return a_buf(n, b_id, t_id) + b_buf(n, b_id, t_id); + return a_buf.load(n, b_id, t_id) + b_buf.load(n, b_id, t_id); }); LoopNest l({c}); std::vector loops = l.getLoopStmtsFor(c); @@ -97,7 +96,7 @@ void testCudaSigmoid() { const int block_count = 16; const int block_size = 128; Dtype dtype = ToDtype(); - Buffer a_buf("a", dtype, {num_iter, block_count, block_size}); + Placeholder a_buf("a", dtype, {num_iter, block_count, block_size}); Tensor* c = Compute( "c", { @@ -106,7 +105,7 @@ void testCudaSigmoid() { {block_size, "t_id"}, }, [&](const VarHandle& n, const VarHandle& b_id, const VarHandle& t_id) { - return sigmoid(sigmoid(a_buf(n, b_id, t_id))); + return sigmoid(sigmoid(a_buf.load(n, b_id, t_id))); }); LoopNest l({c}); std::vector loops = l.getLoopStmtsFor(c); @@ -162,14 +161,14 @@ void testCudaTestVectorAdd01() { static void testCudaTestVectorAdd02_impl(int N, int block_size) { KernelScope kernel_scope; - Buffer a_buf("a", kFloat, {N}); - Buffer b_buf("b", kFloat, {N}); + Placeholder a_buf("a", kFloat, {N}); + Placeholder b_buf("b", kFloat, {N}); Tensor* c = Compute( "c", { {N, "N"}, }, - [&](const VarHandle& n) { return a_buf(n) + b_buf(n); }); + [&](const VarHandle& n) { return a_buf.load(n) + b_buf.load(n); }); LoopNest l({c}); For* n_outer; For* n_inner; @@ -224,9 +223,9 @@ void testCudaTestVectorAdd02() { void testCudaHalfCast() { KernelScope ks; auto half = ToDtype(); - Buffer a("a", half, {4}); + Placeholder a("a", half, {4}); Tensor* b = Compute("b", {{4, "n"}}, [&](const VarHandle& i) { - return Cast::make(kFloat, a(i)); + return Cast::make(kFloat, a.load(i)); }); LoopNest l({b}); @@ -265,11 +264,11 @@ void testCudaDynamicShape2D() { auto testWithSize = [](int32_t M, int32_t N) { VarHandle m("m", kInt); VarHandle n("n", kInt); - Buffer a(BufHandle("a", {m, n}, kFloat)); - Buffer b(BufHandle("b", {m, n}, kFloat)); + Placeholder a(BufHandle("a", {m, n}, kFloat)); + Placeholder b(BufHandle("b", {m, n}, 
kFloat)); Tensor* c = Compute( "c", {{m, "m"}, {n, "n"}}, [&](const VarHandle& i, const VarHandle& j) { - return a(i, j) + b(i, j); + return a.load(i, j) + b.load(i, j); }); LoopNest l({c}); l.prepareForCodegen(); @@ -386,9 +385,9 @@ void testCudaDynamicShapeSplit() { KernelScope ks; constexpr int N = 4096; VarHandle n("n", kInt); - Buffer a(BufHandle("a", {n}, kFloat)); - Tensor* b = - Compute("b", {{n, "n"}}, [&](const VarHandle& i) { return a(i) * 2.0f; }); + Placeholder a(BufHandle("a", {n}, kFloat)); + Tensor* b = Compute( + "b", {{n, "n"}}, [&](const VarHandle& i) { return a.load(i) * 2.0f; }); LoopNest l({b}); For* outer; For* inner; @@ -436,8 +435,8 @@ void testCudaDynamicShapeSplit() { void testCudaOneBlockOneThreadGlobalReduce1() { const static int N = 1024; KernelScope kernel_scope; - Buffer data_buf("data", kFloat, {N}); - Buffer output_buf("output", kFloat, {1}); + Placeholder data_buf("data", kFloat, {N}); + Placeholder output_buf("output", kFloat, {1}); // The test adds the following code for trivial reduction: // for (int bidx = 0; bidx < 1; bidx++) { // blockIdx.x @@ -449,12 +448,12 @@ void testCudaOneBlockOneThreadGlobalReduce1() { // } // } - Store* init_store = Store::make(output_buf, {0}, 0.f, 1); + Store* init_store = output_buf.store({0}, 0.f); VarHandle i1("i1", kInt); - ExprHandle load_data = Load::make(data_buf, {i1}, 1); - ExprHandle load_output = Load::make(output_buf, {0}, 1); + ExprHandle load_data = Load::make(BufHandle(data_buf.data()), {i1}, 1); + ExprHandle load_output = Load::make(BufHandle(output_buf.data()), {0}, 1); ExprHandle add_value = load_output + load_data; - Store* store_output = Store::make(output_buf, {0}, add_value, 1); + Store* store_output = output_buf.store({0}, add_value); For* for_output = For::make(i1, 0, N, store_output); Stmt* reduce_block = Block::make({init_store, for_output}); VarHandle thread_idx("tidx", kInt); @@ -515,10 +514,10 @@ void testCudaOneBlockMultiThreadGlobalReduce1() { // b[0] = b[0] + a[t] // implied atomic // clang-format on - Buffer a_buf("a", kFloat, {N}); - Buffer b_buf("b", kFloat, {1}); + Placeholder a_buf("a", kFloat, {N}); + Placeholder b_buf("b", kFloat, {1}); - Store* init_store = Store::make(b_buf, {0}, 0.f, 1); + Store* init_store = b_buf.store({0}, 0.f); VarHandle t("t", kInt); VarHandle b("b", kInt); @@ -534,10 +533,10 @@ void testCudaOneBlockMultiThreadGlobalReduce1() { // for t in 0..1024: // thread-idx // b[0] = b[0] + a[t] // implied atomic - ExprHandle load_a = Load::make(a_buf, {t}, 1); - ExprHandle load_b = Load::make(b_buf, {0}, 1); + ExprHandle load_a = Load::make(BufHandle(a_buf.data()), {t}, 1); + ExprHandle load_b = Load::make(BufHandle(b_buf.data()), {0}, 1); ExprHandle add_value = load_b + load_a; - Store* store_b = Store::make(b_buf, {0}, add_value, 1); + Store* store_b = b_buf.store({0}, add_value); For* for_b = For::make(t, 0, N, store_b, thread_idx_options); Stmt* reduce_block = Block::make({for_init, for_b}); @@ -597,8 +596,8 @@ void testCudaNoThreadIdxWrite_1() { // covered by its own thread-idx const static int N = 1024; - Buffer a_buf("a", kFloat, {2}); - Buffer b_buf("b", kFloat, {N}); + Placeholder a_buf("a", kFloat, {2}); + Placeholder b_buf("b", kFloat, {N}); VarHandle k("k", kInt); VarHandle l("l", kInt); @@ -608,15 +607,15 @@ void testCudaNoThreadIdxWrite_1() { // a[0] = 0 // for n in 0..2: // a[0] = a[0] + n - Store* store_a0_0 = Store::make(a_buf, {0}, 0.f, 1); - ExprHandle load_a0 = Load::make(a_buf, {0}, 1); + Store* store_a0_0 = a_buf.store({0}, 0.f); + ExprHandle 
load_a0 = Load::make(BufHandle(a_buf.data()), {0}, 1); ExprHandle v1 = load_a0 + n; - Store* store_a0_v1 = Store::make(a_buf, {0}, v1, 1); + Store* store_a0_v1 = a_buf.store({0}, v1); For* loop_a_0 = For::make(n, 0, 2, store_a0_v1); // for m in 0..1024: // thread-idx // b[m] = m - Store* store_bm_m = Store::make(b_buf, {m}, m + 0.f, 1); + Store* store_bm_m = b_buf.store({m}, m + 0.f); LoopOptions thread_idx_options; thread_idx_options.set_gpu_thread_index(0); For* loop_b_1 = For::make(m, 0, N, store_bm_m, thread_idx_options); @@ -624,10 +623,10 @@ void testCudaNoThreadIdxWrite_1() { // a[1] = 1 // for l in 0..2: // a[1] = a[1] + l - Store* store_a1_1 = Store::make(a_buf, {1}, 1.f, 1); - ExprHandle load_a1 = Load::make(a_buf, {1}, 1); + Store* store_a1_1 = a_buf.store({1}, 1.f); + ExprHandle load_a1 = a_buf.load(1); ExprHandle v2 = load_a1 + l; - Store* store_a1_v2 = Store::make(a_buf, {1}, v2, 1); + Store* store_a1_v2 = a_buf.store({1}, v2); For* loop_a_1 = For::make(l, 0, 2, store_a1_v2); Stmt* reduce_block = @@ -699,8 +698,8 @@ void testCudaSharedMemReduce_1() { LoopOptions block_idx_opt; block_idx_opt.set_gpu_block_index(0); - Buffer a("a", kFloat, {1, M, N}); - Buffer b("b", kFloat, {1}); + Placeholder a("a", kFloat, {1, M, N}); + Placeholder b("b", kFloat, {1}); VarHandle k("k", kInt); VarHandle m("m", kInt); VarHandle n("n", kInt); @@ -729,7 +728,8 @@ void testCudaSharedMemReduce_1() { // for n in 0..64: // thread_idx // c(n) = c(n) + a(k, m, n) ExprHandle load_cn = Load::make(kFloat, c, {n}, 1); - ExprHandle a_kmn = Load::make(a, {k * (M * N) + m * N + n}, 1); + ExprHandle a_kmn = + Load::make(BufHandle(a.data()), {k * (M * N) + m * N + n}, 1); ExprHandle v_add = load_cn + a_kmn; Store* store_cn_v = Store::make(c, {n}, v_add); For* loop_n2 = For::make(n, 0, N, store_cn_v, thread_idx_opt); @@ -741,12 +741,12 @@ void testCudaSharedMemReduce_1() { // b(k) = 0 // for n in 0..64: // thread_idx // b(k) = b(k) + c(n) - Store* store_bk_0 = Store::make(b, {k}, 0.f, 1); + Store* store_bk_0 = b.store({k}, 0.f); block.push_back(store_bk_0); - ExprHandle load_bk = Load::make(b, {k}, 1); + ExprHandle load_bk = b.load(k); ExprHandle load_cn = Load::make(kFloat, c, {n}, 1); ExprHandle v_add = load_bk + load_cn; - Store* store_bk = Store::make(b, {k}, v_add, 1); + Store* store_bk = b.store({k}, v_add); For* loop_n3 = For::make(n, 0, N, store_bk, thread_idx_opt); block.push_back(loop_n3); } @@ -835,8 +835,8 @@ void testCudaLocalMemReduce_1() { LoopOptions block_idx_opt; block_idx_opt.set_gpu_block_index(0); - Buffer a("a", kFloat, {1, M, N}); - Buffer b("b", kFloat, {1}); + Placeholder a("a", kFloat, {1, M, N}); + Placeholder b("b", kFloat, {1}); VarHandle k("k", kInt); VarHandle m("m", kInt); VarHandle n("n", kInt); @@ -848,7 +848,7 @@ void testCudaLocalMemReduce_1() { std::vector block_k; { // b(k) = 0 - Store* store_bk_0 = Store::make(b, {k}, 0.f, 1); + Store* store_bk_0 = b.store({k}, 0.f); block_k.push_back(store_bk_0); } std::vector block_n; @@ -866,7 +866,7 @@ void testCudaLocalMemReduce_1() { // for m in 0..128: // c(0) = c(0) + a(k, m, n) ExprHandle load_c0 = Load::make(kFloat, c, {0}, 1); - ExprHandle a_kmn = Load::make(a, {k * (M * N) + m * N + n}, 1); + ExprHandle a_kmn = a.load(k * (M * N) + m * N + n); ExprHandle v_add = load_c0 + a_kmn; Store* store_c0_v = Store::make(c, {0}, v_add); For* loop_m = For::make(m, 0, M, store_c0_v); @@ -874,10 +874,10 @@ void testCudaLocalMemReduce_1() { } { // b(k) = b(k) + c(0) - ExprHandle load_bk = Load::make(b, {k}, 1); + ExprHandle load_bk = 
b.load(k); ExprHandle load_c0 = Load::make(kFloat, c, {0}, 1); ExprHandle v_add = load_bk + load_c0; - Store* store_bk = Store::make(b, {k}, v_add, 1); + Store* store_bk = b.store({k}, v_add); block_n.push_back(store_bk); } { @@ -930,9 +930,9 @@ void testCudaLocalMemReduce_1() { void testCudaHalfSupport() { KernelScope ks; auto half = ToDtype(); - Buffer a("a", half, {4}); + Placeholder a("a", half, {4}); Tensor* b = Compute("b", {{4, "n"}}, [&](const VarHandle& i) { - return Cast::make(half, ExprHandle(2.0f) * a(i)); + return Cast::make(half, ExprHandle(2.0f) * a.load(i)); }); Tensor* c = Compute("c", {{4, "n"}}, [&](const VarHandle& i) { @@ -985,11 +985,60 @@ void testCudaHalfSupport() { cudaFree(dDev); } +void testCudaHalfPropagation() { + KernelScope kernel_scope; + auto half = ToDtype(); + Placeholder a("a", half, {4}); + Tensor* relu = Compute("relu", {{4, "n"}}, [&](const VarHandle& i) { + return Max::make(a.load(i), ExprHandle(new HalfImm(0)), true); + }); + + LoopNest l({relu}); + l.prepareForCodegen(); + Stmt* s = l.root_stmt(); + CudaCodeGen cg(s, {a, relu}); + + std::ostringstream oss; + oss << *cg.stmt(); + + // Check the types used by the Max are Float. + const std::string& verification_pattern = + R"IR( +# CHECK: for ( +# CHECK: float v = float(a[n]); +# CHECK: relu[n] = half(Max(v, 0.f +# CHECK: })IR"; + + torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); + + std::vector aData(4, 2.0f); + std::vector reluData(4, 0.0f); + at::Half* aDev = nullptr; + at::Half* reluDev = nullptr; + auto aSize = aData.size() * sizeof(aData[0]); + auto reluSize = reluData.size() * sizeof(reluData[0]); + + cudaMalloc(&aDev, aSize); + cudaMalloc(&reluDev, reluSize); + cudaMemcpy(aDev, aData.data(), aSize, cudaMemcpyHostToDevice); + cudaMemcpy(reluDev, reluData.data(), reluSize, cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); + + cg.call({aDev, reluDev}); + cudaMemcpy(reluData.data(), reluDev, reluSize, cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + + assertAllEqual(aData, reluData); + + cudaFree(aDev); + cudaFree(reluDev); +} + void testCudaPrioritizeDependents() { KernelScope kernel_scope; - Buffer a("a", kFloat, {10}); - Buffer b("b", kFloat, {12}); - Buffer c("c", kFloat, {12}); + Placeholder a("a", kFloat, {10}); + Placeholder b("b", kFloat, {12}); + Placeholder c("c", kFloat, {12}); LoopOptions block_idx_opt; block_idx_opt.set_gpu_block_index(0); @@ -1002,13 +1051,13 @@ void testCudaPrioritizeDependents() { * c[i] = (i < 10 ? 
a[i] + b[i] : b[i]); * } */ - ExprHandle load_a = Load::make(a, {i}, 1); - ExprHandle load_b = Load::make(b, {i}, 1); + ExprHandle load_a = Load::make(BufHandle(a.data()), {i}, 1); + ExprHandle load_b = Load::make(BufHandle(b.data()), {i}, 1); ExprHandle cmp = CompareSelect::make(i, 10, CompareSelectOperation::kLT); ExprHandle ite = IfThenElse::make(cmp, Add::make(load_a, load_b), load_b); - For* loop = For::make( - i, 0, 12, Block::make({Store::make(c, {i}, ite, 1)}), block_idx_opt); + For* loop = + For::make(i, 0, 12, Block::make({c.store({i}, ite)}), block_idx_opt); CudaCodeGen cuda_cg(loop, a, b, c); @@ -1063,12 +1112,13 @@ void testCudaMaskBlockDim() { KernelScope kernel_scope; int A_SIZE = 100; int B_SIZE = 50; - Buffer a_buf("a", kFloat, {A_SIZE}); - Buffer b_buf("b", kFloat, {B_SIZE}); - Tensor* c = Compute( - "c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { return a_buf(i) + 10; }); + Placeholder a_buf("a", kFloat, {A_SIZE}); + Placeholder b_buf("b", kFloat, {B_SIZE}); + Tensor* c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { + return a_buf.load(i) + 10; + }); Tensor* d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { - return a_buf(i) + b_buf(i); + return a_buf.load(i) + b_buf.load(i); }); LoopNest l({c, d}); @@ -1155,12 +1205,13 @@ void testCudaMaskThreadDim() { KernelScope kernel_scope; int A_SIZE = 50; int B_SIZE = 100; - Buffer a_buf("a", kFloat, {A_SIZE}); - Buffer b_buf("b", kFloat, {B_SIZE}); - Tensor* c = Compute( - "c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { return a_buf(i) + 10; }); + Placeholder a_buf("a", kFloat, {A_SIZE}); + Placeholder b_buf("b", kFloat, {B_SIZE}); + Tensor* c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { + return a_buf.load(i) + 10; + }); Tensor* d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { - return a_buf(i / 2) + b_buf(i); + return a_buf.load(i / 2) + b_buf.load(i); }); LoopNest l({c, d}); @@ -1249,12 +1300,13 @@ void testCudaMaskMultiBlockDim() { KernelScope kernel_scope; int A_SIZE = 100; int B_SIZE = 50; - Buffer a_buf("a", kFloat, {A_SIZE}); - Buffer b_buf("b", kFloat, {B_SIZE}); - Tensor* c = Compute( - "c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { return a_buf(i) + 10; }); + Placeholder a_buf("a", kFloat, {A_SIZE}); + Placeholder b_buf("b", kFloat, {B_SIZE}); + Tensor* c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { + return a_buf.load(i) + 10; + }); Tensor* d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { - return a_buf(i) + b_buf(i); + return a_buf.load(i) + b_buf.load(i); }); LoopNest l({c, d}); @@ -1342,12 +1394,13 @@ void testCudaMaskBlockAndThreadDim() { KernelScope kernel_scope; int A_SIZE = 100; int B_SIZE = 50; - Buffer a_buf("a", kFloat, {A_SIZE}); - Buffer b_buf("b", kFloat, {B_SIZE}); - Tensor* c = Compute( - "c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { return a_buf(i) + 10; }); + Placeholder a_buf("a", kFloat, {A_SIZE}); + Placeholder b_buf("b", kFloat, {B_SIZE}); + Tensor* c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { + return a_buf.load(i) + 10; + }); Tensor* d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { - return a_buf(i) + b_buf(i); + return a_buf.load(i) + b_buf.load(i); }); LoopNest l({c, d}); @@ -1434,19 +1487,19 @@ void testCudaMaskMultiDim() { int OUTER_SIZE = 10; int A_SIZE = 100; int B_SIZE = 50; - Buffer a_buf("a", kFloat, {OUTER_SIZE, A_SIZE}); - Buffer b_buf("b", kFloat, {OUTER_SIZE, B_SIZE}); + Placeholder a_buf("a", kFloat, {OUTER_SIZE, A_SIZE}); + Placeholder b_buf("b", kFloat, {OUTER_SIZE, B_SIZE}); Tensor* 
c = Compute( "C", {{OUTER_SIZE, "i"}, {A_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return ExprHandle(2) * a_buf(i, j); + return ExprHandle(2) * a_buf.load(i, j); }); Tensor* d = Compute( "D", {{OUTER_SIZE, "i"}, {B_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return c->call(i, j * 2) + b_buf(i, j); + return c->call(i, j * 2) + b_buf.load(i, j); }); LoopNest l({c, d}); @@ -1564,19 +1617,19 @@ void testCudaMaskMultiDimSymbolic() { VarHandle OUTER_SIZE("OUTER_SIZE", kInt); VarHandle A_SIZE("A_SIZE", kInt); VarHandle B_SIZE("B_SIZE", kInt); - Buffer a_buf("a", kFloat, {OUTER_SIZE, A_SIZE}); - Buffer b_buf("b", kFloat, {OUTER_SIZE, B_SIZE}); + Placeholder a_buf("a", kFloat, {OUTER_SIZE, A_SIZE}); + Placeholder b_buf("b", kFloat, {OUTER_SIZE, B_SIZE}); Tensor* c = Compute( "C", {{OUTER_SIZE, "i"}, {A_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return ExprHandle(2) * a_buf(i, j); + return ExprHandle(2) * a_buf.load(i, j); }); Tensor* d = Compute( "D", {{OUTER_SIZE, "i"}, {B_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return c->call(i, j * 2) + b_buf(i, j); + return c->call(i, j * 2) + b_buf.load(i, j); }); LoopNest l({c, d}); @@ -1700,10 +1753,10 @@ void testCudaMaskCompoundInnerLoop() { int OUTER_SIZE = 10; int A_SIZE = 100; int B_SIZE = 50; - Buffer a_buf("a", kFloat, {OUTER_SIZE, A_SIZE}); - Buffer b_buf("b", kFloat, {OUTER_SIZE, B_SIZE}); - Buffer c_buf("c", kFloat, {OUTER_SIZE, A_SIZE}); - Buffer d_buf("d", kFloat, {OUTER_SIZE, B_SIZE}); + Placeholder a_buf("a", kFloat, {OUTER_SIZE, A_SIZE}); + Placeholder b_buf("b", kFloat, {OUTER_SIZE, B_SIZE}); + Placeholder c_buf("c", kFloat, {OUTER_SIZE, A_SIZE}); + Placeholder d_buf("d", kFloat, {OUTER_SIZE, B_SIZE}); // Can't build this using Compute and transforms yet. LoopOptions blockBound; @@ -1723,13 +1776,13 @@ void testCudaMaskCompoundInnerLoop() { j, 0, A_SIZE, - Store::make(c_buf, {i, j}, ExprHandle(2) * a_buf(i, j), 1), + c_buf.store({i, j}, ExprHandle(2) * a_buf.load(i, j)), threadBound), For::make( k, 0, B_SIZE, - Store::make(d_buf, {i, k}, c_buf(i, k * 2) + b_buf(i, k), 1), + d_buf.store({i, k}, c_buf.load(i, k * 2) + b_buf.load(i, k)), threadBound)}), blockBound); @@ -1839,10 +1892,10 @@ void testCudaMaskInnerLoopOneBlock() { int OUTER_SIZE = 10; int A_SIZE = 100; int B_SIZE = 50; - Buffer a_buf("a", kFloat, {OUTER_SIZE, A_SIZE}); - Buffer b_buf("b", kFloat, {OUTER_SIZE, B_SIZE}); - Buffer c_buf("c", kFloat, {OUTER_SIZE, A_SIZE}); - Buffer d_buf("d", kFloat, {OUTER_SIZE, B_SIZE}); + Placeholder a_buf("a", kFloat, {OUTER_SIZE, A_SIZE}); + Placeholder b_buf("b", kFloat, {OUTER_SIZE, B_SIZE}); + Placeholder c_buf("c", kFloat, {OUTER_SIZE, A_SIZE}); + Placeholder d_buf("d", kFloat, {OUTER_SIZE, B_SIZE}); // Can't build this using Compute and transforms yet. 
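Illustrative sketch (not part of the patch): the masking tests around this point hand-build loops with LoopOptions axis bindings because Compute and the loop transforms cannot express them yet. A pared-down single-block version using only calls visible in this diff might look like the following; it assumes a CUDA-enabled build, and the helper name, sizes, and include list are invented for the example.

#include <sstream>
#include "torch/csrc/jit/tensorexpr/cuda_codegen.h"
#include "torch/csrc/jit/tensorexpr/ir.h"
#include "torch/csrc/jit/tensorexpr/tensor.h" // assumed home of Placeholder

using namespace torch::jit::tensorexpr;

// One block of N threads computing b[t] = a[t] + 1, with the loops bound to
// blockIdx.x / threadIdx.x through LoopOptions, as the tests above do.
void exampleGpuBoundLoop() {
  KernelScope kernel_scope;
  const int N = 128;
  Placeholder a_buf("a", kFloat, {N});
  Placeholder b_buf("b", kFloat, {N});

  LoopOptions blockBound;
  blockBound.set_gpu_block_index(0);   // outer loop -> blockIdx.x
  LoopOptions threadBound;
  threadBound.set_gpu_thread_index(0); // inner loop -> threadIdx.x

  VarHandle blk("blk", kInt);
  VarHandle t("t", kInt);
  Stmt* inner =
      For::make(t, 0, N, b_buf.store({t}, a_buf.load(t) + 1.f), threadBound);
  Stmt* outer = For::make(blk, 0, 1, inner, blockBound);

  CudaCodeGen cuda_cg(outer, a_buf, b_buf);
  std::ostringstream oss;
  oss << *cuda_cg.stmt();  // inspect the generated kernel body
}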
LoopOptions blockBound; @@ -1862,13 +1915,13 @@ void testCudaMaskInnerLoopOneBlock() { j, 0, A_SIZE, - Store::make(c_buf, {i, j}, ExprHandle(2) * a_buf(i, j), 1), + c_buf.store({i, j}, ExprHandle(2) * a_buf.load(i, j)), threadBound), For::make( k, 0, B_SIZE, - Store::make(d_buf, {i, k}, c_buf(i, k * 2) + b_buf(i, k), 1), + d_buf.store({i, k}, c_buf.load(i, k * 2) + b_buf.load(i, k)), threadBound)})); stmt = FlattenIndexes(stmt); @@ -1978,19 +2031,19 @@ void testCudaMaskMultiDimMultiAxis() { int OUTER_SIZE = 10; int A_SIZE = 30; int B_SIZE = 15; - Buffer a_buf("a", kFloat, {OUTER_SIZE, A_SIZE}); - Buffer b_buf("b", kFloat, {OUTER_SIZE, B_SIZE}); + Placeholder a_buf("a", kFloat, {OUTER_SIZE, A_SIZE}); + Placeholder b_buf("b", kFloat, {OUTER_SIZE, B_SIZE}); Tensor* c = Compute( "C", {{OUTER_SIZE, "i"}, {A_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return ExprHandle(2) * a_buf(i, j); + return ExprHandle(2) * a_buf.load(i, j); }); Tensor* d = Compute( "D", {{OUTER_SIZE, "i"}, {B_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return c->call(i, j * 2) + b_buf(i, j); + return c->call(i, j * 2) + b_buf.load(i, j); }); LoopNest l({c, d}); @@ -2109,19 +2162,19 @@ void testCudaMaskMultiDimMultiLevel() { int OUTER_B_SIZE = 5; int A_SIZE = 30; int B_SIZE = 15; - Buffer a_buf("a", kFloat, {OUTER_A_SIZE, A_SIZE}); - Buffer b_buf("b", kFloat, {OUTER_B_SIZE, B_SIZE}); + Placeholder a_buf("a", kFloat, {OUTER_A_SIZE, A_SIZE}); + Placeholder b_buf("b", kFloat, {OUTER_B_SIZE, B_SIZE}); Tensor* c = Compute( "C", {{OUTER_A_SIZE, "i"}, {A_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return ExprHandle(2) * a_buf(i, j); + return ExprHandle(2) * a_buf.load(i, j); }); Tensor* d = Compute( "D", {{OUTER_B_SIZE, "i"}, {B_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return c->call(i, j * 2) + b_buf(i, j); + return c->call(i, j * 2) + b_buf.load(i, j); }); LoopNest l({c, d}); diff --git a/test/cpp/tensorexpr/test_expr.cpp b/test/cpp/tensorexpr/test_expr.cpp index c1386a85764b..6fafb2813902 100644 --- a/test/cpp/tensorexpr/test_expr.cpp +++ b/test/cpp/tensorexpr/test_expr.cpp @@ -2,9 +2,7 @@ #include "test/cpp/tensorexpr/padded_buffer.h" #include "test/cpp/tensorexpr/test_utils.h" -#include "torch/csrc/jit/tensorexpr/buffer.h" #include "torch/csrc/jit/tensorexpr/eval.h" -#include "torch/csrc/jit/tensorexpr/function.h" #include "torch/csrc/jit/tensorexpr/ir.h" #include "torch/csrc/jit/tensorexpr/ir_printer.h" #include "torch/csrc/jit/tensorexpr/loopnest.h" @@ -64,13 +62,13 @@ void testExprLetTest02() { void testExprLetStmtTest01() { KernelScope kernel_scope; - Buffer a_buf("a", kFloat, {1}); - Buffer b_buf("b", kFloat, {1}); + Placeholder a_buf("a", kFloat, {1}); + Placeholder b_buf("b", kFloat, {1}); - ExprHandle load_a = Load::make(a_buf, {0}, 1); + ExprHandle load_a = a_buf.load(0); VarHandle var = VarHandle("v", kFloat); Stmt* let_store = Let::make(var, load_a); - Stmt* store_b = Store::make(b_buf, {0}, var, 1); + Stmt* store_b = b_buf.store({0}, var); Block* block = Block::make({let_store, store_b}); SimpleIREvaluator eval(block, a_buf, b_buf); @@ -164,15 +162,33 @@ void testExprDoubleTest() { ASSERT_EQ(eval.value(), 2 + (3 * 3 + 4)); } +void testExprDisallowBoolArithmetic() { + KernelScope kernel_scope; + VarHandle x("x", kBool); + VarHandle y("y", kBool); + std::string error{"arithmetic binary operations on Bool not supported"}; + ASSERT_THROWS_WITH((x + y), error); + ASSERT_THROWS_WITH((x - y), error); + ASSERT_THROWS_WITH((x * y), error); + 
ASSERT_THROWS_WITH((x / y), error); + ASSERT_THROWS_WITH((x & y), error); + ASSERT_THROWS_WITH((x | y), error); + ASSERT_THROWS_WITH((x ^ y), error); + ASSERT_THROWS_WITH((x << y), error); + ASSERT_THROWS_WITH((x >> y), error); + ASSERT_THROWS_WITH(Max::make(x, y, /*propagate_nans=*/true), error); + ASSERT_THROWS_WITH(Min::make(x, y, /*propagate_nans=*/true), error); +} + void testExprVectorAdd01() { KernelScope kernel_scope; const int kVectorSize = 8; const int kVectorCount = 128; const int kTotalSize = kVectorSize * kVectorCount; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); /* Build the following: @@ -183,17 +199,14 @@ void testExprVectorAdd01() { } */ VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make( - a_buf, + ExprHandle load_a = a_buf.loadWithMask( {Ramp::make(index * kVectorSize, 1, kVectorSize)}, Broadcast::make(1, kVectorSize)); - ExprHandle load_b = Load::make( - b_buf, + ExprHandle load_b = b_buf.loadWithMask( {Ramp::make(index * kVectorSize, 1, kVectorSize)}, Broadcast::make(1, kVectorSize)); ExprHandle value = load_a + load_b; - Stmt* store_c = Store::make( - c_buf, + Stmt* store_c = c_buf.storeWithMask( {Ramp::make(index * kVectorSize, 1, kVectorSize)}, value, Broadcast::make(1, kVectorSize)); @@ -220,28 +233,23 @@ void testExprVectorAdd01() { void testExprCompareSelectEQ() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 1); std::vector b_buffer(N, 1); std::vector c_buffer(N, 0); std::vector c_ref(N, 0); - auto mask = IntImm::make(1); VarHandle i("i", kInt); auto memcpy_expr = For::make( i, 0, N, - Store::make( - c, + c.store( {i}, CompareSelect::make( - Load::make(a, {i}, mask), - Load::make(b, {i}, mask), - CompareSelectOperation::kEQ), - mask)); + a.load(i), b.load(i), CompareSelectOperation::kEQ))); SimpleIREvaluator ir_eval(memcpy_expr, a, b, c); ir_eval(a_buffer, b_buffer, c_buffer); @@ -264,15 +272,14 @@ void testExprCompareSelectDtypes() { // result = ((int)lhs == (int)rhs) ? (float)retval1 : (float)retval2 KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); - Buffer c(BufHandle("C", {N}, kFloat)); + Placeholder a(BufHandle("A", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); + Placeholder c(BufHandle("C", {N}, kFloat)); std::vector a_buffer(N, 1); std::vector b_buffer(N, 1); std::vector c_buffer(N, 0.0f); std::vector c_ref(N, 3.14f); - auto mask = IntImm::make(1); VarHandle i("i", kInt); // C[i] = (A[i] == B[i]) ? 3.14f : 2.78f // A and B are int, C is float. 
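Illustrative sketch (not part of the patch): testExprVectorAdd01 above shows that explicitly masked, vectorized accesses now go through loadWithMask/storeWithMask, while plain load/store cover the unmasked case. A minimal vector add built from the same calls (the helper name and sizes are invented for the example) could be:

#include <vector>
#include "torch/csrc/jit/tensorexpr/eval.h"
#include "torch/csrc/jit/tensorexpr/ir.h"
#include "torch/csrc/jit/tensorexpr/tensor.h" // assumed home of Placeholder

using namespace torch::jit::tensorexpr;

// c = a + b in chunks of kVectorSize lanes, using Ramp indices and an
// all-ones Broadcast mask passed through the *WithMask entry points.
void examplePlaceholderVectorAdd() {
  KernelScope kernel_scope;
  const int kVectorSize = 8;
  const int kVectorCount = 16;
  const int kTotalSize = kVectorSize * kVectorCount;
  Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
  Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat));
  Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat));

  VarHandle index("index", kInt);
  ExprHandle lanes = Ramp::make(index * kVectorSize, 1, kVectorSize);
  ExprHandle mask = Broadcast::make(1, kVectorSize);
  ExprHandle load_a = a_buf.loadWithMask({lanes}, mask);
  ExprHandle load_b = b_buf.loadWithMask({lanes}, mask);
  Stmt* store_c = c_buf.storeWithMask({lanes}, load_a + load_b, mask);
  Stmt* stmt = For::make(index, 0, kVectorCount, store_c);

  std::vector<float> a_v(kTotalSize, 1.f), b_v(kTotalSize, 2.f),
      c_v(kTotalSize, 0.f);
  SimpleIREvaluator ir_eval(stmt, a_buf, b_buf, c_buf);
  ir_eval(a_v, b_v, c_v);  // each element of c_v should come out as 3.f
}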
@@ -280,16 +287,14 @@ void testExprCompareSelectDtypes() { i, 0, N, - Store::make( - c, + c.store( {i}, CompareSelect::make( - Load::make(a, {i}, mask), - Load::make(b, {i}, mask), + a.load(i), + b.load(i), FloatImm::make(3.14f), FloatImm::make(2.78f), - CompareSelectOperation::kEQ), - mask)); + CompareSelectOperation::kEQ))); SimpleIREvaluator ir_eval(select_expr, a, b, c); ir_eval(a_buffer, b_buffer, c_buffer); @@ -306,16 +311,14 @@ void testExprCompareSelectDtypes() { void testExprIntrinsicsDtypes() { KernelScope kernel_scope; constexpr int N = 256; - Buffer a(BufHandle("A", {N}, kDouble)); - Buffer b(BufHandle("B", {N}, kDouble)); + Placeholder a(BufHandle("A", {N}, kDouble)); + Placeholder b(BufHandle("B", {N}, kDouble)); std::vector a_buffer(N, -10.0); std::vector b_buffer(N, 0.0); std::vector b_ref(N, 10.0); - auto mask = IntImm::make(1); VarHandle i("i", kInt); - auto fabs_expr = For::make( - i, 0, N, Store::make(b, {i}, fabs(Load::make(a, {i}, mask)), mask)); + auto fabs_expr = For::make(i, 0, N, b.store({i}, fabs(a.load(i)))); SimpleIREvaluator ir_eval(fabs_expr, a, b); ir_eval(a_buffer, b_buffer); @@ -463,11 +466,11 @@ void testExprDynamicShapeAdd() { KernelScope kernel_scope; auto testWithSize = [](int32_t size) { VarHandle n("n", kInt); - Buffer a(BufHandle("a", {n}, kFloat)); - Buffer b(BufHandle("b", {n}, kFloat)); - Buffer c(BufHandle("c", {n}, kFloat)); + Placeholder a(BufHandle("a", {n}, kFloat)); + Placeholder b(BufHandle("b", {n}, kFloat)); + Placeholder c(BufHandle("c", {n}, kFloat)); VarHandle i("i", kInt); - Stmt* s = For::make(i, 0, n, Store::make(c, {i}, a(i) + b(i), 1)); + Stmt* s = For::make(i, 0, n, c.store({i}, a.load(i) + b.load(i))); std::vector aData(size, 1.0f); std::vector bData(size, 2.0f); std::vector cData(size, 0.0f); @@ -483,12 +486,10 @@ void testCond01() { KernelScope kernel_scope; const int N = 16; PaddedBuffer a_v(N); - Buffer a_buf("a", kFloat, {N}); + Placeholder a_buf("a", kFloat, {N}); VarHandle index = VarHandle("index", kInt); - Stmt* assign_x2 = - Store::make(BufHandle(a_buf.data()), {index}, cast(index) * 2, 1); - Stmt* assign_x3 = - Store::make(BufHandle(a_buf.data()), {index}, cast(index) * 3, 1); + Stmt* assign_x2 = a_buf.store({index}, cast(index) * 2); + Stmt* assign_x3 = a_buf.store({index}, cast(index) * 3); ExprHandle even_cond = CompareSelect::make(Mod::make(index, 2), 0, kEQ); Stmt* assign = Cond::make(even_cond, assign_x2, assign_x3); Stmt* for_stmt = For::make(index, 0, N, assign); @@ -546,9 +547,9 @@ void testStmtClone() { KernelScope kernel_scope; const int N = 16; - Buffer a_buf("a", kInt, {N}); + Placeholder a_buf("a", kInt, {N}); VarHandle index = VarHandle("index", kInt); - Stmt* body = Store::make(BufHandle(a_buf.data()), {index}, 5, 1); + Stmt* body = a_buf.store({index}, 5); Stmt* loop = For::make(index, 0, N, body); Stmt* cloned_loop = Stmt::clone(loop); @@ -562,7 +563,7 @@ void testStmtClone() { // Let's add another assign to the body in the cloned loop and verify that the // original statement hasn't changed while the cloned one has. 
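// Sketch only (not from the patch), reusing the headers above: how the Cond01
// hunk reads with the fluent store API. The cast<float>(...) template argument
// is an assumption (the plain-text rendering of this diff drops angle-bracket
// contents); everything else mirrors the hunk.
void sketchCondStore() {
  KernelScope kernel_scope;
  const int N = 16;
  Placeholder a_buf("a", kFloat, {N});
  VarHandle index("index", kInt);
  Stmt* assign_x2 = a_buf.store({index}, cast<float>(index) * 2);
  Stmt* assign_x3 = a_buf.store({index}, cast<float>(index) * 3);
  ExprHandle even_cond = CompareSelect::make(Mod::make(index, 2), 0, kEQ);
  Stmt* for_stmt =
      For::make(index, 0, N, Cond::make(even_cond, assign_x2, assign_x3));
  std::vector<float> a_v(N, 0.0f);
  SimpleIREvaluator eval(for_stmt, a_buf);
  eval(a_v);  // even slots end up 2*index, odd slots 3*index
}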
- Stmt* body_addition = Store::make(BufHandle(a_buf.data()), {index}, 33, 1); + Stmt* body_addition = a_buf.store({index}, 33); Block* cloned_body = static_cast(static_cast(cloned_loop)->body()); cloned_body->append_stmt(body_addition); diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp index ab916d370e82..d80710fa732b 100644 --- a/test/cpp/tensorexpr/test_kernel.cpp +++ b/test/cpp/tensorexpr/test_kernel.cpp @@ -2,7 +2,6 @@ #include #include #include -#include #include #include #include diff --git a/test/cpp/tensorexpr/test_llvm.cpp b/test/cpp/tensorexpr/test_llvm.cpp index ee8540eb58c4..7f4e1a0afc24 100644 --- a/test/cpp/tensorexpr/test_llvm.cpp +++ b/test/cpp/tensorexpr/test_llvm.cpp @@ -3,9 +3,7 @@ #include "test/cpp/tensorexpr/padded_buffer.h" #include "test/cpp/tensorexpr/test_utils.h" -#include "torch/csrc/jit/tensorexpr/buffer.h" #include "torch/csrc/jit/tensorexpr/eval.h" -#include "torch/csrc/jit/tensorexpr/function.h" #include "torch/csrc/jit/tensorexpr/ir.h" #include "torch/csrc/jit/tensorexpr/ir_printer.h" #include "torch/csrc/jit/tensorexpr/ir_simplifier.h" @@ -163,17 +161,13 @@ void testLLVMByteToDoubleCastTest() { void testLLVMLetTest01() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kFloat)); + Placeholder a(BufHandle("A", {1}, kFloat)); std::vector v = {1, 0}; std::vector args({v.data()}); VarHandle x("x", kFloat); auto block = Block::make({ Let::make(x, 3.f), - Store::make( - a, - {IntImm::make(0)}, - ExprHandle(2.f) + (x * ExprHandle(3.f) + ExprHandle(4.f)), - IntImm::make(1)), + a.store({0}, ExprHandle(2.f) + (x * ExprHandle(3.f) + ExprHandle(4.f))), }); LLVMCodeGen cg(block, {a}); @@ -184,20 +178,17 @@ void testLLVMLetTest01() { void testLLVMLetTest02() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kFloat)); + Placeholder a(BufHandle("A", {1}, kFloat)); std::vector v = {1, 0}; std::vector args({v.data()}); VarHandle x("x", kFloat); VarHandle y("y", kFloat); - auto block = Block::make({ - Let::make(x, 3.f), - Let::make(y, 6.f), - Store::make( - a, - {IntImm::make(0)}, - ExprHandle(2.f) + (x * ExprHandle(3.f) + y * ExprHandle(4.f)), - IntImm::make(1)), - }); + auto block = Block::make( + {Let::make(x, 3.f), + Let::make(y, 6.f), + a.store( + {IntImm::make(0)}, + ExprHandle(2.f) + (x * ExprHandle(3.f) + y * ExprHandle(4.f)))}); LLVMCodeGen cg(block, {a}); ASSERT_EQ(cg.value(args), 0); @@ -207,22 +198,20 @@ void testLLVMLetTest02() { void testLLVMLetTestMultitype() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kDouble)); + Placeholder a(BufHandle("A", {1}, kDouble)); std::vector v = {1, 0}; std::vector args({v.data()}); VarHandle x("x", kByte); VarHandle y("y", kHalf); - auto block = Block::make({ - Let::make(x, 3), - Let::make(y, 6.f), - Store::make( - a, - {IntImm::make(0)}, - Cast::make( - kDouble, - ExprHandle(2.f) + (x * ExprHandle(3.f) + y * ExprHandle(4.f))), - IntImm::make(1)), - }); + auto block = + Block::make({Let::make(x, 3), + Let::make(y, 6.f), + a.store( + {0}, + Cast::make( + kDouble, + ExprHandle(2.f) + + (x * ExprHandle(3.f) + y * ExprHandle(4.f))))}); LLVMCodeGen cg(block, {a}); ASSERT_EQ(cg.value(args), 0); @@ -231,7 +220,7 @@ void testLLVMLetTestMultitype() { void testLLVMBufferTest() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {32}, kFloat)); + Placeholder a(BufHandle("A", {32}, kFloat)); std::vector v(5); std::vector args({v.data()}); auto rv = IntImm::make(0); @@ -241,14 +230,14 @@ void testLLVMBufferTest() { void testLLVMBlockTest() { KernelScope kernel_scope; - 
Buffer a(BufHandle("A", {32}, kInt)); + Placeholder a(BufHandle("A", {32}, kInt)); std::vector v = {1, 2}; std::vector args({v.data()}); auto block = Block::make({ - Store::make(a, {IntImm::make(0)}, IntImm::make(3), IntImm::make(1)), - Store::make(a, {IntImm::make(1)}, IntImm::make(4), IntImm::make(1)), - Store::make(a, {IntImm::make(0)}, IntImm::make(4), IntImm::make(1)), + a.store({0}, 3), + a.store({1}, 4), + a.store({0}, 4), }); LLVMCodeGen cg(block, {a}); @@ -259,16 +248,12 @@ void testLLVMBlockTest() { void testLLVMLoadStoreTest() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kInt)); - Buffer b(BufHandle("B", {1}, kInt)); + Placeholder a(BufHandle("A", {1}, kInt)); + Placeholder b(BufHandle("B", {1}, kInt)); std::vector a_buffer = {42}; std::vector b_buffer = {-11}; - auto store = Store::make( - b, - {IntImm::make(0)}, - Load::make(a, {IntImm::make(0)}, IntImm::make(1)), - IntImm::make(1)); + auto store = b.store({0}, a.load(0)); LLVMCodeGen cg(store, {a, b}); std::vector args({a_buffer.data(), b_buffer.data()}); ASSERT_EQ(cg.value(args), 0); @@ -278,21 +263,14 @@ void testLLVMLoadStoreTest() { void testLLVMIfThenElseTest() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kInt)); - Buffer b(BufHandle("B", {1}, kInt)); - Buffer c(BufHandle("C", {1}, kInt)); + Placeholder a(BufHandle("A", {1}, kInt)); + Placeholder b(BufHandle("B", {1}, kInt)); + Placeholder c(BufHandle("C", {1}, kInt)); std::vector a_buffer = {42}; std::vector b_buffer = {-11}; std::vector c_buffer = {1}; - auto store = Store::make( - b, - {IntImm::make(0)}, - IfThenElse::make( - Load::make(c, {IntImm::make(0)}, IntImm::make(1)), // cond - Load::make(a, {IntImm::make(0)}, IntImm::make(1)), // then - IntImm::make(0)), // else - IntImm::make(1)); + auto store = b.store({0}, IfThenElse::make(c.load(0), a.load(0), 0)); LLVMCodeGen cg(store, {a, b, c}); std::vector args({a_buffer.data(), b_buffer.data(), c_buffer.data()}); ASSERT_EQ(cg.value(args), 0); @@ -302,15 +280,15 @@ void testLLVMIfThenElseTest() { void testLLVMVecLoadStoreTest() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kInt)); - Buffer b(BufHandle("B", {1}, kInt)); + Placeholder a(BufHandle("A", {1}, kInt)); + Placeholder b(BufHandle("B", {1}, kInt)); std::vector a_buffer = {1, 1, 1, 1}; std::vector b_buffer = {2, 2, 2, 2}; - auto store = Store::make( - b, + auto store = b.storeWithMask( {Ramp::make(0, 1, 4)}, - Load::make(a, {Ramp::make(0, 1, 4)}, Broadcast::make(IntImm::make(1), 4)), + a.loadWithMask( + {Ramp::make(0, 1, 4)}, Broadcast::make(IntImm::make(1), 4)), Broadcast::make(IntImm::make(1), 4)); LLVMCodeGen cg(store, {a, b}); std::vector args({a_buffer.data(), b_buffer.data()}); @@ -328,16 +306,14 @@ void testLLVMVecLoadStoreTest() { #define FLOAT_INTRINSICS_TEST(Name, Lanes) \ void testLLVMVecFloat_##Name##Lane##Lanes##Test() { \ KernelScope kernel_scope; \ - Buffer a(BufHandle("A", {1}, kFloat)); \ - Buffer b(BufHandle("B", {1}, kFloat)); \ + Placeholder a(BufHandle("A", {1}, kFloat)); \ + Placeholder b(BufHandle("B", {1}, kFloat)); \ float val = 0.5f; \ std::vector a_buffer(Lanes, val); \ std::vector b_buffer(Lanes, val); \ - auto store = Store::make( \ - b, \ + auto store = b.storeWithMask( \ {Ramp::make(0, 1, Lanes)}, \ - Name(Load::make( \ - a, \ + Name(a.loadWithMask( \ {Ramp::make(0, 1, Lanes)}, \ Broadcast::make(IntImm::make(1), Lanes))), \ Broadcast::make(IntImm::make(1), Lanes)); \ @@ -373,16 +349,14 @@ FLOAT_INTRINSICS_TEST(lgamma, 8) #define DOUBLE_INTRINSICS_TEST(Name, Lanes) \ void 
testLLVMVecDouble_##Name##Lane##Lanes##Test() { \ KernelScope kernel_scope; \ - Buffer a(BufHandle("A", {1}, kDouble)); \ - Buffer b(BufHandle("B", {1}, kDouble)); \ + Placeholder a(BufHandle("A", {1}, kDouble)); \ + Placeholder b(BufHandle("B", {1}, kDouble)); \ float val = 0.5f; \ std::vector a_buffer(Lanes, val); \ std::vector b_buffer(Lanes, val); \ - auto store = Store::make( \ - b, \ + auto store = b.storeWithMask( \ {Ramp::make(0, 1, Lanes)}, \ - Name(Load::make( \ - a, \ + Name(a.loadWithMask( \ {Ramp::make(0, 1, Lanes)}, \ Broadcast::make(IntImm::make(1), Lanes))), \ Broadcast::make(IntImm::make(1), Lanes)); \ @@ -417,13 +391,12 @@ DOUBLE_INTRINSICS_TEST(lgamma, 4) void testLLVMVectorizerLoadStoreTest() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kInt)); + Placeholder a(BufHandle("A", {1}, kInt)); - Tensor* c = Compute("c", {{4, "i"}}, [&](const VarHandle& i) { - return Load::make(a, {i}, 1); - }); + Tensor* c = + Compute("c", {{4, "i"}}, [&](const VarHandle& i) { return a.load(i); }); - Buffer c_buf(BufHandle(c->func_var())); + Placeholder c_buf(BufHandle(c->buf())); LoopNest l({c}); Stmt* s = l.root_stmt(); l.vectorize(dynamic_cast(s)->front()); @@ -442,15 +415,13 @@ void testLLVMVectorizerLoadStoreTest() { void testLLVMMemcpyTest() { KernelScope kernel_scope; constexpr int N = 32; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); std::vector a_buffer(N, 42); std::vector b_buffer(N, 0); - auto mask = IntImm::make(1); VarHandle i("i", kInt); - auto expr = - For::make(i, 0, N, Store::make(b, {i}, Load::make(a, {i}, mask), mask)); + auto expr = For::make(i, 0, N, b.store({i}, a.load(i))); LLVMCodeGen cg(expr, {a, b}); @@ -466,12 +437,11 @@ void testLLVMMemcpyTest() { void testLLVMBzeroTest() { KernelScope kernel_scope; constexpr int N = 32; - Buffer b(BufHandle("B", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); std::vector b_buffer(N, 11); - auto mask = IntImm::make(1); VarHandle i("i", kInt); - auto expr = For::make(i, 0, N, Store::make(b, {i}, IntImm::make(0), mask)); + auto expr = For::make(i, 0, N, b.store({i}, 0)); LLVMCodeGen cg(expr, {b}); @@ -485,24 +455,15 @@ void testLLVMBzeroTest() { void testLLVMElemwiseAdd() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 41); std::vector b_buffer(N, 1); std::vector c_buffer(N, 1); - auto mask = IntImm::make(1); VarHandle i("i", kInt); - auto expr = For::make( - i, - 0, - N, - Store::make( - c, - {i}, - Add::make(Load::make(a, {i}, mask), Load::make(b, {i}, mask)), - mask)); + auto expr = For::make(i, 0, N, c.store({i}, Add::make(a.load(i), b.load(i)))); LLVMCodeGen cg(expr, {a, b, c}); @@ -520,21 +481,15 @@ void testLLVMElemwiseAdd() { void testLLVMElemwiseAddFloat() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kFloat)); - Buffer b(BufHandle("B", {N}, kFloat)); - Buffer c(BufHandle("C", {N}, kFloat)); + Placeholder a(BufHandle("A", {N}, kFloat)); + Placeholder b(BufHandle("B", {N}, kFloat)); + Placeholder c(BufHandle("C", {N}, kFloat)); std::vector a_buffer(N, 41); std::vector b_buffer(N, 1); std::vector c_buffer(N, 1); - auto mask = IntImm::make(1); VarHandle i("i", kInt); - auto expr 
= For::make( - i, - 0, - N, - Store::make( - c, {i}, Load::make(a, {i}, mask) + Load::make(b, {i}, mask), mask)); + auto expr = For::make(i, 0, N, c.store({i}, a.load(i) + b.load(i))); LLVMCodeGen cg(expr, {a, b, c}); @@ -552,8 +507,8 @@ void testLLVMElemwiseAddFloat() { void testLLVMElemwiseLog10Float() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kFloat)); - Buffer b(BufHandle("B", {N}, kFloat)); + Placeholder a(BufHandle("A", {N}, kFloat)); + Placeholder b(BufHandle("B", {N}, kFloat)); std::vector a_buffer(N, 10.0f); std::vector b_buffer(N, 2.0f); @@ -563,10 +518,9 @@ void testLLVMElemwiseLog10Float() { i, 0, N / 4, - Store::make( - b, + b.storeWithMask( {Ramp::make(i * 4, 1, 4)}, - log10(Load::make(a, {Ramp::make(i * 4, 1, 4)}, mask)), + log10(a.loadWithMask({Ramp::make(i * 4, 1, 4)}, mask)), mask)); LLVMCodeGen cg(expr, {a, b}); @@ -583,8 +537,8 @@ void testLLVMElemwiseLog10Float() { void testLLVMElemwiseLog1pFloat() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kFloat)); - Buffer b(BufHandle("B", {N}, kFloat)); + Placeholder a(BufHandle("A", {N}, kFloat)); + Placeholder b(BufHandle("B", {N}, kFloat)); std::vector a_buffer(N, expf(3.0f) - 1); std::vector b_buffer(N, 42.0f); @@ -594,10 +548,9 @@ void testLLVMElemwiseLog1pFloat() { i, 0, N / 4, - Store::make( - b, + b.storeWithMask( {Ramp::make(i * 4, 1, 4)}, - log1p(Load::make(a, {Ramp::make(i * 4, 1, 4)}, mask)), + log1p(a.loadWithMask({Ramp::make(i * 4, 1, 4)}, mask)), mask)); LLVMCodeGen cg(expr, {a, b}); @@ -614,24 +567,16 @@ void testLLVMElemwiseLog1pFloat() { void testLLVMElemwiseMaxInt() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 41); std::vector b_buffer(N, 1); std::vector c_buffer(N, 1); - auto mask = IntImm::make(1); VarHandle i("i", kInt); - auto expr = For::make( - i, - 0, - N, - Store::make( - c, - {i}, - Max::make(Load::make(a, {i}, mask), Load::make(b, {i}, mask), false), - mask)); + auto expr = + For::make(i, 0, N, c.store({i}, Max::make(a.load(i), b.load(i), false))); LLVMCodeGen cg(expr, {a, b, c}); @@ -649,24 +594,16 @@ void testLLVMElemwiseMaxInt() { void testLLVMElemwiseMinInt() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 41); std::vector b_buffer(N, 1); std::vector c_buffer(N, 1); - auto mask = IntImm::make(1); VarHandle i("i", kInt); - auto expr = For::make( - i, - 0, - N, - Store::make( - c, - {i}, - Min::make(Load::make(a, {i}, mask), Load::make(b, {i}, mask), false), - mask)); + auto expr = + For::make(i, 0, N, c.store({i}, Min::make(a.load(i), b.load(i), false))); LLVMCodeGen cg(expr, {a, b, c}); @@ -684,24 +621,16 @@ void testLLVMElemwiseMinInt() { void testLLVMElemwiseMaxFloat() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kFloat)); - Buffer b(BufHandle("B", {N}, kFloat)); - Buffer c(BufHandle("C", {N}, kFloat)); + Placeholder a(BufHandle("A", {N}, kFloat)); + Placeholder b(BufHandle("B", {N}, kFloat)); + Placeholder c(BufHandle("C", {N}, 
kFloat)); std::vector a_buffer(N, 41); std::vector b_buffer(N, 1); std::vector c_buffer(N, 1); - auto mask = IntImm::make(1); VarHandle i("i", kInt); - auto expr = For::make( - i, - 0, - N, - Store::make( - c, - {i}, - Max::make(Load::make(a, {i}, mask), Load::make(b, {i}, mask), false), - mask)); + auto expr = + For::make(i, 0, N, c.store({i}, Max::make(a.load(i), b.load(i), false))); LLVMCodeGen cg(expr, {a, b, c}); @@ -719,24 +648,16 @@ void testLLVMElemwiseMaxFloat() { void testLLVMElemwiseMaxNaNFloat() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kFloat)); - Buffer b(BufHandle("B", {N}, kFloat)); - Buffer c(BufHandle("C", {N}, kFloat)); + Placeholder a(BufHandle("A", {N}, kFloat)); + Placeholder b(BufHandle("B", {N}, kFloat)); + Placeholder c(BufHandle("C", {N}, kFloat)); std::vector a_buffer(N, NAN); std::vector b_buffer(N, 1); std::vector c_buffer(N, 1); - auto mask = IntImm::make(1); VarHandle i("i", kInt); - auto expr = For::make( - i, - 0, - N, - Store::make( - c, - {i}, - Max::make(Load::make(a, {i}, mask), Load::make(b, {i}, mask), false), - mask)); + auto expr = + For::make(i, 0, N, c.store({i}, Max::make(a.load(i), b.load(i), false))); LLVMCodeGen cg(expr, {a, b, c}); @@ -755,24 +676,16 @@ void testLLVMElemwiseMaxNaNFloat() { void testLLVMElemwiseMinFloat() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kFloat)); - Buffer b(BufHandle("B", {N}, kFloat)); - Buffer c(BufHandle("C", {N}, kFloat)); + Placeholder a(BufHandle("A", {N}, kFloat)); + Placeholder b(BufHandle("B", {N}, kFloat)); + Placeholder c(BufHandle("C", {N}, kFloat)); std::vector a_buffer(N, 41); std::vector b_buffer(N, 1); std::vector c_buffer(N, 1); - auto mask = IntImm::make(1); VarHandle i("i", kInt); - auto expr = For::make( - i, - 0, - N, - Store::make( - c, - {i}, - Min::make(Load::make(a, {i}, mask), Load::make(b, {i}, mask), false), - mask)); + auto expr = + For::make(i, 0, N, c.store({i}, Min::make(a.load(i), b.load(i), false))); LLVMCodeGen cg(expr, {a, b, c}); @@ -790,24 +703,16 @@ void testLLVMElemwiseMinFloat() { void testLLVMElemwiseMinNaNFloat() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kFloat)); - Buffer b(BufHandle("B", {N}, kFloat)); - Buffer c(BufHandle("C", {N}, kFloat)); + Placeholder a(BufHandle("A", {N}, kFloat)); + Placeholder b(BufHandle("B", {N}, kFloat)); + Placeholder c(BufHandle("C", {N}, kFloat)); std::vector a_buffer(N, NAN); std::vector b_buffer(N, 1); std::vector c_buffer(N, 1); - auto mask = IntImm::make(1); VarHandle i("i", kInt); - auto expr = For::make( - i, - 0, - N, - Store::make( - c, - {i}, - Min::make(Load::make(a, {i}, mask), Load::make(b, {i}, mask), false), - mask)); + auto expr = + For::make(i, 0, N, c.store({i}, Min::make(a.load(i), b.load(i), false))); LLVMCodeGen cg(expr, {a, b, c}); @@ -826,24 +731,15 @@ void testLLVMElemwiseMinNaNFloat() { void testLLVMElemwiseMod() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 41); std::vector b_buffer(N, 23); std::vector c_buffer(N, 18); - auto mask = IntImm::make(1); VarHandle i("i", kInt); - auto expr = For::make( - i, - 0, - N, - Store::make( - c, - {i}, - Mod::make(Load::make(a, {i}, mask), Load::make(b, {i}, mask)), - mask)); + auto expr 
= For::make(i, 0, N, c.store({i}, Mod::make(a.load(i), b.load(i)))); LLVMCodeGen cg(expr, {a, b, c}); @@ -861,9 +757,9 @@ void testLLVMElemwiseMod() { void testLLVMCompareSelectIntEQ() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 1); std::vector b_buffer(N, 1); std::vector c_buffer(N, 0); @@ -874,20 +770,15 @@ void testLLVMCompareSelectIntEQ() { c_ref[i] = 0; } - auto mask = IntImm::make(1); VarHandle i("i", kInt); auto expr = For::make( i, 0, N, - Store::make( - c, + c.store( {i}, CompareSelect::make( - Load::make(a, {i}, mask), - Load::make(b, {i}, mask), - CompareSelectOperation::kEQ), - mask)); + a.load(i), b.load(i), CompareSelectOperation::kEQ))); LLVMCodeGen cg(expr, {a, b, c}); @@ -907,27 +798,22 @@ void testLLVMCompareSelectIntEQ() { void testLLVMCompareSelectFloatEQ() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kFloat)); - Buffer b(BufHandle("B", {N}, kFloat)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kFloat)); + Placeholder b(BufHandle("B", {N}, kFloat)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 1.0f); std::vector b_buffer(N, 1.0f); std::vector c_buffer(N, 0); - auto mask = IntImm::make(1); VarHandle i("i", kInt); auto expr = For::make( i, 0, N, - Store::make( - c, + c.store( {i}, CompareSelect::make( - Load::make(a, {i}, mask), - Load::make(b, {i}, mask), - CompareSelectOperation::kEQ), - mask)); + a.load(i), b.load(i), CompareSelectOperation::kEQ))); LLVMCodeGen cg(expr, {a, b, c}); @@ -946,9 +832,9 @@ void testLLVMCompareSelectFloatEQ() { void testLLVMCompareSelectByteGT() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kByte)); - Buffer b(BufHandle("B", {N}, kByte)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kByte)); + Placeholder b(BufHandle("B", {N}, kByte)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 0); std::vector b_buffer(N, 0); std::vector c_buffer(N, 0); @@ -959,20 +845,15 @@ void testLLVMCompareSelectByteGT() { c_ref[i] = 1; } - auto mask = IntImm::make(1); VarHandle i("i", kInt); auto expr = For::make( i, 0, N, - Store::make( - c, + c.store( {i}, CompareSelect::make( - Load::make(a, {i}, mask), - Load::make(b, {i}, mask), - CompareSelectOperation::kGT), - mask)); + a.load(i), b.load(i), CompareSelectOperation::kGT))); LLVMCodeGen cg(expr, {a, b, c}); @@ -992,28 +873,23 @@ void testLLVMCompareSelectByteGT() { void testLLVMCompareSelectByteGE() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kByte)); - Buffer b(BufHandle("B", {N}, kByte)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kByte)); + Placeholder b(BufHandle("B", {N}, kByte)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 0); std::vector b_buffer(N, 0); std::vector c_buffer(N, 0); std::vector c_ref(N, 1); - auto mask = IntImm::make(1); VarHandle i("i", kInt); auto expr = For::make( i, 0, N, - Store::make( - c, + c.store( {i}, CompareSelect::make( - Load::make(a, {i}, mask), - Load::make(b, {i}, mask), - CompareSelectOperation::kGE), - mask)); + a.load(i), b.load(i), CompareSelectOperation::kGE))); LLVMCodeGen cg(expr, {a, b, c}); @@ -1033,9 +909,9 @@ void 
testLLVMCompareSelectByteGE() { void testLLVMCompareSelectByteLT() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kByte)); - Buffer b(BufHandle("B", {N}, kByte)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kByte)); + Placeholder b(BufHandle("B", {N}, kByte)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 0); std::vector b_buffer(N, 128); std::vector c_buffer(N, 0); @@ -1046,20 +922,15 @@ void testLLVMCompareSelectByteLT() { c_ref[i] = 0; } - auto mask = IntImm::make(1); VarHandle i("i", kInt); auto expr = For::make( i, 0, N, - Store::make( - c, + c.store( {i}, CompareSelect::make( - Load::make(a, {i}, mask), - Load::make(b, {i}, mask), - CompareSelectOperation::kLT), - mask)); + a.load(i), b.load(i), CompareSelectOperation::kLT))); LLVMCodeGen cg(expr, {a, b, c}); @@ -1079,28 +950,23 @@ void testLLVMCompareSelectByteLT() { void testLLVMCompareSelectByteLE() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kByte)); - Buffer b(BufHandle("B", {N}, kByte)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kByte)); + Placeholder b(BufHandle("B", {N}, kByte)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 0); std::vector b_buffer(N, 128); std::vector c_buffer(N, 0); std::vector c_ref(N, 1); - auto mask = IntImm::make(1); VarHandle i("i", kInt); auto expr = For::make( i, 0, N, - Store::make( - c, + c.store( {i}, CompareSelect::make( - Load::make(a, {i}, mask), - Load::make(b, {i}, mask), - CompareSelectOperation::kLE), - mask)); + a.load(i), b.load(i), CompareSelectOperation::kLE))); LLVMCodeGen cg(expr, {a, b, c}); @@ -1119,10 +985,9 @@ void testLLVMCompareSelectByteLE() { void testLLVMStoreFloat() { KernelScope kernel_scope; - Buffer result(BufHandle("result", {1}, kFloat)); + Placeholder result(BufHandle("result", {1}, kFloat)); std::vector result_buffer = {0.0f}; - auto expr = Store::make( - result, {IntImm::make(0)}, FloatImm::make(3.14f), IntImm::make(1)); + auto expr = result.store({0}, FloatImm::make(3.14f)); LLVMCodeGen cg(expr, {result}); std::vector args({result_buffer.data()}); ASSERT_EQ(cg.value(args), 0); @@ -1137,7 +1002,7 @@ void testLLVMSimpleMath01() { }); LoopNest l({tensor}); Stmt* stmt = l.root_stmt(); - Buffer f_buf(BufHandle(tensor->func_var())); + Placeholder f_buf(BufHandle(tensor->buf())); LLVMCodeGen cg(stmt, {f_buf}); PaddedBuffer f_v(N, "f_v"); @@ -1154,13 +1019,13 @@ void testLLVMSimpleMath01() { void testLLVMComputeMul() { KernelScope kernel_scope; const int N = 1024; - Buffer a(BufHandle("a", {N}, kFloat)); - Buffer b(BufHandle("b", {N}, kFloat)); + Placeholder a(BufHandle("a", {N}, kFloat)); + Placeholder b(BufHandle("b", {N}, kFloat)); Tensor* c = Compute("c", {{N, "i"}}, [&](const VarHandle& i) { - return Load::make(a, {i}, 1) * Load::make(b, {i}, 1); + return a.load(i) * b.load(i); }); - Buffer c_buf(BufHandle(c->func_var())); + Placeholder c_buf(BufHandle(c->buf())); LoopNest l({c}); Stmt* s = l.root_stmt(); @@ -1178,15 +1043,14 @@ void testLLVMBroadcastAdd() { KernelScope kernel_scope; const int M = 32; const int N = 1024; - Buffer a(BufHandle("a", {M, N}, kFloat)); - Buffer b(BufHandle("b", {N}, kFloat)); + Placeholder a(BufHandle("a", {M, N}, kFloat)); + Placeholder b(BufHandle("b", {N}, kFloat)); Tensor* c = Compute( "c", {{M, "i"}, {N, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - ExprHandle mask(1); - return Load::make(a, {i, j}, mask) + Load::make(b, {j}, mask); + return 
a.load(i, j) + b.load(j); }); - Buffer c_buf(BufHandle(c->func_var())); + Placeholder c_buf(BufHandle(c->buf())); LoopNest l({c}); l.prepareForCodegen(); Stmt* s = l.root_stmt(); @@ -1225,11 +1089,11 @@ void testLLVMDynamicShapeAdd() { KernelScope kernel_scope; auto testWithSize = [](int32_t size) { VarHandle n("n", kInt); - Buffer a(BufHandle("a", {n}, kFloat)); - Buffer b(BufHandle("b", {n}, kFloat)); - Buffer c(BufHandle("c", {n}, kFloat)); + Placeholder a(BufHandle("a", {n}, kFloat)); + Placeholder b(BufHandle("b", {n}, kFloat)); + Placeholder c(BufHandle("c", {n}, kFloat)); VarHandle i("i", kInt); - Stmt* s = For::make(i, 0, n, Store::make(c, {i}, a(i) + b(i), 1)); + Stmt* s = For::make(i, 0, n, c.store({i}, a.load(i) + b.load(i))); std::vector aData(size, 1.0f); std::vector bData(size, 2.0f); std::vector cData(size, 0.0f); @@ -1247,11 +1111,11 @@ void testLLVMBindDynamicShapeAdd() { KernelScope kernel_scope; auto testWithSize = [](int32_t size) { VarHandle n("n", kInt); - Buffer a(BufHandle("a", {n}, kFloat)); - Buffer b(BufHandle("b", {n}, kFloat)); - Buffer c(BufHandle("c", {n}, kFloat)); + Placeholder a(BufHandle("a", {n}, kFloat)); + Placeholder b(BufHandle("b", {n}, kFloat)); + Placeholder c(BufHandle("c", {n}, kFloat)); VarHandle i("i", kInt); - Stmt* s = For::make(i, 0, n, Store::make(c, {i}, a(i) + b(i), 1)); + Stmt* s = For::make(i, 0, n, c.store({i}, a.load(i) + b.load(i))); std::vector aData(size, 1.0f); std::vector bData(size, 2.0f); std::vector cData(size, 0.0f); @@ -1268,10 +1132,11 @@ void testLLVMTensorDynamicShapeAdd() { KernelScope kernel_scope; auto testWithSize = [](int32_t size) { VarHandle n("n", kInt); - Buffer a(BufHandle("a", {n}, kFloat)); - Buffer b(BufHandle("b", {n}, kFloat)); - Tensor* c = Compute( - "c", {{n, "n"}}, [&](const VarHandle& i) { return a(i) + b(i); }); + Placeholder a(BufHandle("a", {n}, kFloat)); + Placeholder b(BufHandle("b", {n}, kFloat)); + Tensor* c = Compute("c", {{n, "n"}}, [&](const VarHandle& i) { + return a.load(i) + b.load(i); + }); LoopNest l({c}); Stmt* s = l.root_stmt(); LLVMCodeGen cg(s, {a, b, c, n}); @@ -1291,11 +1156,11 @@ void testLLVMDynamicShape2D() { auto testWithSize = [](int32_t M, int32_t N) { VarHandle m("m", kInt); VarHandle n("n", kInt); - Buffer a(BufHandle("a", {m, n}, kFloat)); - Buffer b(BufHandle("b", {m, n}, kFloat)); + Placeholder a(BufHandle("a", {m, n}, kFloat)); + Placeholder b(BufHandle("b", {m, n}, kFloat)); Tensor* c = Compute( "c", {{m, "m"}, {n, "n"}}, [&](const VarHandle& i, const VarHandle& j) { - return a(i, j) + b(i, j); + return a.load(i, j) + b.load(i, j); }); LoopNest l({c}); l.prepareForCodegen(); @@ -1323,7 +1188,7 @@ void testLLVMEmptyStmt() { void testLLVMEliminatedStmt() { KernelScope kernel_scope; - Buffer a(BufHandle("a", {1}, kFloat)); + Placeholder a(BufHandle("a", {1}, kFloat)); Tensor* c = Compute("c", {{0, "m"}}, [&](const VarHandle& m) { return m; }); @@ -1344,7 +1209,7 @@ void testLLVMSimpleReduction() { int N = 64; const int kTotalSize = M * N; - Buffer a("a", kFloat, {1, M, N}); + Placeholder a("a", kFloat, {1, M, N}); // TODO: why doesn't implicit vector work? std::vector axis = {DimArg(1)}; @@ -1383,7 +1248,7 @@ void testLLVMRFactorReduction() { int N = 64; const int kTotalSize = M * N; - Buffer a("a", kFloat, {1, M, N}); + Placeholder a("a", kFloat, {1, M, N}); // TODO: why doesn't implicit vector work? 
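// Sketch (not part of the patch), same setup as the sketches above: the
// Tensor::func_var() to Tensor::buf() change in these test_llvm.cpp hunks. To
// pass a computed tensor's storage to a codegen, the tests now wrap the Buf
// returned by buf() in a BufHandle-backed Placeholder. Assumes the LLVM
// codegen header that test_llvm.cpp already includes.
void sketchTensorBufArg() {
  KernelScope kernel_scope;
  Placeholder a(BufHandle("a", {1024}, kFloat));
  Tensor* c = Compute("c", {{1024, "i"}}, [&](const VarHandle& i) {
    return a.load(i) * a.load(i);
  });
  Placeholder c_buf(BufHandle(c->buf()));  // was: BufHandle(c->func_var())
  LoopNest l({c});
  l.prepareForCodegen();
  Stmt* s = l.root_stmt();
  LLVMCodeGen cg(s, {a, c_buf});
  (void)cg;
}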
std::vector axis = {DimArg(1)}; @@ -1433,7 +1298,7 @@ void DISABLED_testLLVMRFactorVectorizedReduction() { int N = 64; const int kTotalSize = M * N; - Buffer a("a", kFloat, {1, M, N}); + Placeholder a("a", kFloat, {1, M, N}); // TODO: why doesn't implicit vector work? std::vector axis = {DimArg(1)}; diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp index 602eb116e7b9..60c8fb1d62c0 100644 --- a/test/cpp/tensorexpr/test_loopnest.cpp +++ b/test/cpp/tensorexpr/test_loopnest.cpp @@ -7,9 +7,7 @@ #include #include #include -#include #include -#include #include #include #include @@ -577,11 +575,11 @@ void testExprSplitWithMask01() { KernelScope kernel_scope; const int M = 26; const int N = 5; - Buffer a_buf("a", kFloat, {M, N}); - Buffer b_buf("b", kFloat, {M, N}); + Placeholder a_buf("a", kFloat, {M, N}); + Placeholder b_buf("b", kFloat, {M, N}); Tensor* tensor = Compute( "f", {{M, "m"}, {N, "n"}}, [&](const ExprHandle& m, const ExprHandle& n) { - return a_buf(m, n) + b_buf(m, n) + 1.0f; + return a_buf.load(m, n) + b_buf.load(m, n) + 1.0f; }); For* n_outer; For* n_inner; @@ -609,13 +607,47 @@ void testExprSplitWithMask01() { ExpectAllNear(c_v, c_ref, 1e-5); } +// Tests the case where we split a loop cleanly multiple times, we should not +// insert any masks. +void testExprSplitWithMaskRepeatedNoMask() { + KernelScope kernel_scope; + const int M = 64; + Placeholder a_buf("a", kFloat, {M}); + Placeholder b_buf("b", kFloat, {M}); + Tensor* tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) { + return a_buf.load(m) + b_buf.load(m) + 1.0f; + }); + + LoopNest l({tensor}); + std::vector loops = l.getLoopStmtsFor(tensor); + For *outer, *mid, *inner; + l.splitWithMask(loops[0], 4, &outer, &inner); + l.splitWithMask(outer, 4, &outer, &mid); + + Stmt* stmt1 = IRSimplifier::simplify(l.root_stmt()); + std::ostringstream oss; + oss << *stmt1; + + // Two splits mean 3 loops, but should need no masks in this case. 
+ const std::string& verification_pattern = + R"IR( +# CHECK: for ( +# CHECK-NOT: if ( +# CHECK: for ( +# CHECK-NOT: if ( +# CHECK: for ( +# CHECK-NOT: if ( +# CHECK: f[)IR"; + torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); +} + void testSplitWithTailWithLoopOptions() { KernelScope kernel_scope; const int M = 21; - Buffer a_buf("a", kFloat, {M}); - Buffer b_buf("b", kFloat, {M}); + Placeholder a_buf("a", kFloat, {M}); + Placeholder b_buf("b", kFloat, {M}); Tensor* tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) { - return a_buf(m) + b_buf(m) + 1.0f; + return a_buf.load(m) + b_buf.load(m) + 1.0f; }); For *outer, *inner, *tail; @@ -642,10 +674,10 @@ void testSplitWithTailWithLoopOptions() { void testSplitWithMaskWithLoopOptions() { KernelScope kernel_scope; const int M = 21; - Buffer a_buf("a", kFloat, {M}); - Buffer b_buf("b", kFloat, {M}); + Placeholder a_buf("a", kFloat, {M}); + Placeholder b_buf("b", kFloat, {M}); Tensor* tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) { - return a_buf(m) + b_buf(m) + 1.0f; + return a_buf.load(m) + b_buf.load(m) + 1.0f; }); For *outer, *inner; @@ -667,13 +699,13 @@ void testScheduleBroadcastAddBuffer() { const int M = 4; const int N = 5; const int K = 6; - Buffer a_buf("a", kFloat, {M, N}); - Buffer b_buf("b", kFloat, {N, K}); + Placeholder a_buf("a", kFloat, {M, N}); + Placeholder b_buf("b", kFloat, {N, K}); Tensor* c = Compute( "broadcast_add", {{M, "m"}, {N, "n"}, {K, "k"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return a_buf(m, n) + b_buf(n, k); + return a_buf.load(m, n) + b_buf.load(n, k); }); LoopNest l({c}); Stmt* stmt = l.root_stmt(); @@ -716,13 +748,13 @@ void testScheduleFunctionCall01() { const int M = 4; const int N = 5; const int K = 6; - Buffer a_buf("a", kFloat, {M, N}); - Buffer b_buf("b", kFloat, {N, K}); + Placeholder a_buf("a", kFloat, {M, N}); + Placeholder b_buf("b", kFloat, {N, K}); Tensor* c = Compute( "broadcast_add", {{M, "m"}, {N, "n"}, {K, "k"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return a_buf(m, n) + b_buf(n, k); + return a_buf.load(m, n) + b_buf.load(n, k); }); Tensor* d = Compute( "d", @@ -773,22 +805,22 @@ void testScheduleInlineSimple() { const int M = 4; const int N = 5; const int K = 6; - Buffer a_buf("a", kFloat, {M, N}); - Buffer b_buf("b", kFloat, {N, K}); - Buffer c_buf("c", kFloat, {M, N}); - Buffer d_buf("d", kFloat, {M, K}); + Placeholder a_buf("a", kFloat, {M, N}); + Placeholder b_buf("b", kFloat, {N, K}); + Placeholder c_buf("c", kFloat, {M, N}); + Placeholder d_buf("d", kFloat, {M, K}); Tensor* x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return a_buf(m, n) * b_buf(n, k); + return a_buf.load(m, n) * b_buf.load(n, k); }); Tensor* y = Compute( "y", {{M, "m2"}, {N, "n2"}, {K, "k2"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return c_buf(m, n) * d_buf(m, k) + x->call(m, n, k); + return c_buf.load(m, n) * d_buf.load(m, k) + x->call(m, n, k); }); LoopNest l1({y}); @@ -854,22 +886,22 @@ void InlineFunc01Helper(const std::vector& inline_order) { const int M = 4; const int N = 5; const int K = 6; - Buffer a_buf("a", kFloat, {M, N}); - Buffer b_buf("b", kFloat, {N, K}); - Buffer c_buf("c", kFloat, {M, N}); - Buffer d_buf("d", kFloat, {M, K}); + Placeholder a_buf("a", kFloat, {M, N}); + Placeholder b_buf("b", kFloat, {N, K}); + Placeholder c_buf("c", kFloat, {M, N}); + Placeholder d_buf("d", kFloat, {M, K}); 
Tensor* x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return a_buf(m, n) * b_buf(n, k); + return a_buf.load(m, n) * b_buf.load(n, k); }); Tensor* y = Compute( "y", {{M, "m2"}, {N, "n2"}, {K, "k2"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return c_buf(m, n) * d_buf(m, k) + x->call(m, n, k); + return c_buf.load(m, n) * d_buf.load(m, k) + x->call(m, n, k); }); Tensor* z = Compute( "z", @@ -942,8 +974,9 @@ void InlineFunc01Helper(const std::vector& inline_order) { "z", {{M, "m3"}, {N, "n3"}, {K, "k3"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return a_buf(m, n) * b_buf(n, k) + - (c_buf(m, n) * d_buf(m, k) + a_buf(m, n) * b_buf(n, k)); + return a_buf.load(m, n) * b_buf.load(n, k) + + (c_buf.load(m, n) * d_buf.load(m, k) + + a_buf.load(m, n) * b_buf.load(n, k)); }); LoopNest l2({z2}); l2.prepareForCodegen(); @@ -1090,14 +1123,14 @@ void testScheduleInlineIntrinsics() { const int M = 4; const int N = 5; const int K = 6; - Buffer a_buf("a", kFloat, {M, N}); - Buffer b_buf("b", kFloat, {N, K}); + Placeholder a_buf("a", kFloat, {M, N}); + Placeholder b_buf("b", kFloat, {N, K}); Tensor* x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return a_buf(m, n) * b_buf(n, k); + return a_buf.load(m, n) * b_buf.load(n, k); }); Tensor* y = Compute( "y", @@ -1455,11 +1488,11 @@ void testScheduleFuserStyle() { const int kVectorCount = 128; const int kTotalSize = kVectorSize * kVectorCount; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); Tensor* b = Compute( "f", {{kTotalSize, "i"}}, [&](const std::vector& axes) { - return a_buf(axes[0]) + 11.0f; + return a_buf.load(axes[0]) + 11.0f; }); Tensor* c = Compute( @@ -1488,19 +1521,19 @@ void testScheduleFuserThreeArg() { const int kVectorCount = 128; const int kTotalSize = kVectorSize * kVectorCount; - Buffer a(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); - Buffer c(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); - Buffer d(BufHandle("D", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder c(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder d(BufHandle("D", {ExprHandle(kTotalSize)}, kFloat)); Tensor* e = Compute("e", {{kTotalSize, "i"}}, [&](const VarHandle& i) { - return a(i) + b(i); + return a.load(i) + b.load(i); }); Tensor* f = Compute("f", {{kTotalSize, "i"}}, [&](const VarHandle& i) { - return (*e)(i) + c(i); + return e->call(i) + c.load(i); }); Tensor* g = Compute("g", {{kTotalSize, "i"}}, [&](const VarHandle& i) { - return (*f)(i) + d(i); + return f->call(i) + d.load(i); }); LoopNest l({g}); @@ -1526,11 +1559,11 @@ void testScheduleDynamicShape2D() { auto testWithSize = [](int32_t M, int32_t N) { VarHandle m("m", kInt); VarHandle n("n", kInt); - Buffer a(BufHandle("a", {m, n}, kFloat)); - Buffer b(BufHandle("b", {m, n}, kFloat)); + Placeholder a(BufHandle("a", {m, n}, kFloat)); + Placeholder b(BufHandle("b", {m, n}, kFloat)); Tensor* c = Compute( "c", {{m, "m"}, {n, "n"}}, [&](const VarHandle& i, const VarHandle& j) { - return a(i, j) + b(i, j); + return a.load(i, j) + b.load(i, j); }); LoopNest l({c}); Stmt* s = l.root_stmt(); @@ -2050,16 +2083,19 @@ void 
testLoopNestReorderExtraStatements() { }); LoopNest l({tensor}); - Buffer extra(BufHandle("res", {6, 3}, kFloat)); + Placeholder extra(BufHandle("res", {6, 3}, kFloat)); auto loops = l.getLoopStmtsFor(tensor); VarHandle i = VarHandle(loops[0]->var()); - Stmt* store_1 = Store::make(extra, {i, 0}, ExprHandle(1.f), 1); - Stmt* store_2 = Store::make(extra, {i, 1}, ExprHandle(2.f), 1); + Stmt* store_1 = + Store::make(BufHandle(extra.data()), {i, 0}, ExprHandle(1.f), 1); + Stmt* store_2 = + Store::make(BufHandle(extra.data()), {i, 1}, ExprHandle(2.f), 1); // stmt 3 is the Function body. - Stmt* store_3 = Store::make(extra, {i, 2}, ExprHandle(4.f), 1); + Stmt* store_3 = + Store::make(BufHandle(extra.data()), {i, 2}, ExprHandle(4.f), 1); loops[0]->body()->prepend_stmt(store_1); loops[1]->body()->prepend_stmt(store_2); @@ -2190,16 +2226,16 @@ void LoopNestReorderTestHelper( [](const std::vector&) { return -1; }); LoopNest l({c}); - Buffer extra(BufHandle("extra", {5}, kInt)); + Placeholder extra(BufHandle("extra", {5}, kInt)); auto loops = l.getLoopStmtsFor(c); int j = 0; for (auto* l : loops) { // Add an increment at each layer of the loop which counts the number of // times the loop executes. - Load* load = new Load(extra, {new IntImm(j)}, new IntImm(1)); + Load* load = new Load(extra.data(), {new IntImm(j)}, new IntImm(1)); Add* add = new Add(load, new IntImm(1)); - Stmt* store = Store::make(extra, {j}, ExprHandle(add), 1); + Stmt* store = new Store(extra.data(), {new IntImm(j)}, add, new IntImm(1)); if (prepend) { l->body()->prepend_stmt(store); } @@ -2301,22 +2337,22 @@ void testLoopNestReorderInternalLoopNest() { const int M = 4; const int N = 5; const int K = 6; - Buffer a_buf("a", kFloat, {M, N}); - Buffer b_buf("b", kFloat, {N, K}); - Buffer c_buf("c", kFloat, {M, N}); - Buffer d_buf("d", kFloat, {M, K}); + Placeholder a_buf("a", kFloat, {M, N}); + Placeholder b_buf("b", kFloat, {N, K}); + Placeholder c_buf("c", kFloat, {M, N}); + Placeholder d_buf("d", kFloat, {M, K}); Tensor* x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return a_buf(m, n) * b_buf(n, k); + return a_buf.load(m, n) * b_buf.load(n, k); }); Tensor* y = Compute( "y", {{M, "m2"}, {N, "n2"}, {K, "k2"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return c_buf(m, n) * d_buf(m, k) + x->call(m, n, k); + return c_buf.load(m, n) * d_buf.load(m, k) + x->call(m, n, k); }); Tensor* z = Compute( "z", @@ -2824,9 +2860,9 @@ void testNormalizeAndSplitWithTail() { // Create a dummy tensor to construct LoopNest. 
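// Sketch (not from the patch): when a statement is spliced into an existing
// loop body by hand, the reorder tests above fall back to raw Store/Load node
// construction and reach the Placeholder's underlying Buf through .data().
// Names below are illustrative; the calls mirror the hunks.
void sketchRawNodesFromPlaceholder() {
  KernelScope kernel_scope;
  Placeholder counters(BufHandle("counters", {5}, kInt));

  // Handle level: wrap the Buf* in a BufHandle and keep the explicit mask.
  Stmt* reset = Store::make(BufHandle(counters.data()), {0}, 0, 1);

  // Node level: build Load/Add/Store IR nodes directly, as the reorder helper
  // does to count how many times each loop level runs.
  Load* load = new Load(counters.data(), {new IntImm(0)}, new IntImm(1));
  Add* add = new Add(load, new IntImm(1));
  Stmt* bump = new Store(counters.data(), {new IntImm(0)}, add, new IntImm(1));

  Block* body = Block::make({reset, bump});
  (void)body;
}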
ExprHandle n(100); - Buffer a(BufHandle("a", {n}, kFloat)); + Placeholder a(BufHandle("a", {n}, kFloat)); Tensor* b = - Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a(i); }); + Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); }); LoopNest l({b}); // Input IR: @@ -2872,9 +2908,10 @@ void testDetectInlineRankMismatch() { KernelScope kernel_scope; const int kTotalSize = 8; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Tensor* a = Compute( - "a", {{kTotalSize, "i"}}, [&](const VarHandle& i) { return a_buf(i); }); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Tensor* a = Compute("a", {{kTotalSize, "i"}}, [&](const VarHandle& i) { + return a_buf.load(i); + }); Tensor* reshape = Compute( "reshape", {{kTotalSize / 2, "i"}, {2, "j"}}, @@ -2882,7 +2919,7 @@ void testDetectInlineRankMismatch() { LoopNest l({reshape}); ASSERT_THROWS_WITH( l.computeInline(l.getLoopBodyFor(a)), - "Buffer indexed access is inconsistent with its rank"); + "Placeholder indexed access is inconsistent with its rank"); } } // namespace jit diff --git a/test/cpp/tensorexpr/test_reductions.cpp b/test/cpp/tensorexpr/test_reductions.cpp index 2af8e33d3981..8436388f0d6b 100644 --- a/test/cpp/tensorexpr/test_reductions.cpp +++ b/test/cpp/tensorexpr/test_reductions.cpp @@ -7,9 +7,7 @@ #include "test/cpp/tensorexpr/padded_buffer.h" #include "torch/csrc/jit/tensorexpr/analysis.h" -#include "torch/csrc/jit/tensorexpr/buffer.h" #include "torch/csrc/jit/tensorexpr/eval.h" -#include "torch/csrc/jit/tensorexpr/function.h" #include "torch/csrc/jit/tensorexpr/ir.h" #include "torch/csrc/jit/tensorexpr/ir_printer.h" #include "torch/csrc/jit/tensorexpr/ir_simplifier.h" @@ -25,7 +23,7 @@ using namespace torch::jit::tensorexpr; void testReduceSum1D() { KernelScope kernel_scope; - Buffer b(BufHandle("b", {10}, kFloat)); + Placeholder b(BufHandle("b", {10}, kFloat)); std::vector in(10); for (int j = 0; j < 10; ++j) { in[j] = j; @@ -54,7 +52,7 @@ void testReduceSum2D() { VarHandle m("m", kInt); VarHandle n("n", kInt); - Buffer b(BufHandle("b", {m, n}, kFloat)); + Placeholder b(BufHandle("b", {m, n}, kFloat)); std::vector in(M * N); for (int i = 0; i < M; ++i) { for (int j = 0; j < N; ++j) { @@ -92,7 +90,7 @@ void testReduceSum3D() { const int M = 10; VarHandle m("m", kInt); - Buffer b(BufHandle("b", {2, 3, m}, kFloat)); + Placeholder b(BufHandle("b", {2, 3, m}, kFloat)); Tensor* c = Reduce("sum", {{2, "l"}, {3, "n"}}, Sum(), b, {{m, "m"}}); LoopNest loop({c}); @@ -140,7 +138,7 @@ void testReduceSum3D() { } // This is the same as just reducing the original result across that axis. 
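// Sketch (illustrative, not part of the patch): Reduce now consumes a
// Placeholder directly, and reducing an already-computed Tensor is expressed
// by wrapping that tensor's buf() in a fresh Placeholder, exactly as
// testReduceSum3D does in the hunk that follows.
void sketchChainedReduce() {
  KernelScope kernel_scope;
  VarHandle m("m", kInt);
  Placeholder b(BufHandle("b", {2, 3, m}, kFloat));

  // First reduction: sum out the trailing m axis.
  Tensor* c = Reduce("sum", {{2, "l"}, {3, "n"}}, Sum(), b, {{m, "m"}});

  // Second reduction: treat c's storage as an input and sum out the n axis.
  Placeholder c_buf(BufHandle(c->buf()));
  Tensor* e = Reduce("sum2", {{2, "l"}}, Sum(), c_buf, {{3, "n"}});

  LoopNest loop({e});
  loop.prepareForCodegen();
  Stmt* s = IRSimplifier::simplify(loop.root_stmt());
  (void)s;
}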
- Buffer c_buf(BufHandle(c->func_var())); + Placeholder c_buf(BufHandle(c->buf())); Tensor* e = Reduce("sum3", {{2, "l"}}, Sum(), c_buf, {{3, "m"}}); LoopNest loop3({e}); loop3.prepareForCodegen(); @@ -159,9 +157,9 @@ void testReduceSum3D() { void testReduceSum10D() { KernelScope kernel_scope; - Buffer in_(BufHandle("in_", {2, 3, 2, 3, 2, 3, 2, 3, 2, 3}, kFloat)); + Placeholder in_(BufHandle("in_", {2, 3, 2, 3, 2, 3, 2, 3, 2, 3}, kFloat)); const int InputSize = 2 * 3 * 2 * 3 * 2 * 3 * 2 * 3 * 2 * 3; - Buffer out_(BufHandle("out_", {2, 3, 2, 3, 2}, kFloat)); + Placeholder out_(BufHandle("out_", {2, 3, 2, 3, 2}, kFloat)); const int OutputSize = 2 * 3 * 2 * 3 * 2; std::vector in(InputSize, 1.f); @@ -195,7 +193,7 @@ void testReduceProduct() { const int M = 4; const int N = 4; - Buffer b(BufHandle("b", {M, N}, kFloat)); + Placeholder b(BufHandle("b", {M, N}, kFloat)); std::vector in(M * N); for (int i = 0; i < M; ++i) { for (int j = 0; j < N; ++j) { @@ -232,7 +230,7 @@ void testReduceProduct() { void testReduceMax() { KernelScope kernel_scope; - Buffer in_(BufHandle("b", {10}, kFloat)); + Placeholder in_(BufHandle("b", {10}, kFloat)); std::vector in(10); std::vector out(1, -1.f); @@ -252,7 +250,7 @@ void testReduceMax() { ASSERT_EQ(out[0], 9); - Buffer in2_(BufHandle("b", {2, 5}, kFloat)); + Placeholder in2_(BufHandle("b", {2, 5}, kFloat)); std::vector out2(2, -1.f); Tensor* m2d = Reduce("max", {{2, "n"}}, Maximum(kFloat), in2_, {{5, "m"}}); @@ -274,7 +272,7 @@ void testReduceMinCustomInitializer() { KernelScope kernel_scope; VarHandle minInit("minInit", kFloat); - Buffer in_(BufHandle("b", {10}, kFloat)); + Placeholder in_(BufHandle("b", {10}, kFloat)); std::vector in(10); std::vector out(1, -1.f); @@ -286,7 +284,7 @@ void testReduceMinCustomInitializer() { "min", {}, Minimum(ExprHandle(minInit)), - [&](ParameterList& v) { return in_.call(v); }, + [&](ParameterList& v) { return in_.load(v); }, {{10, "m"}}); LoopNest loop({min}); @@ -312,7 +310,7 @@ void testReduceAnyAll() { KernelScope kernel_scope; VarHandle searchValue("searchValue", kInt); - Buffer b(BufHandle("b", {4, 10}, kInt)); + Placeholder b(BufHandle("b", {4, 10}, kInt)); Reducer anyEqSV(ExprHandle(0), [](ExprHandle a, ExprHandle b) { return CompareSelect::make(a, 1, 1, b, kEQ); @@ -323,7 +321,7 @@ void testReduceAnyAll() { {{4, "i"}}, anyEqSV, [&](const auto& i, const auto& j) { - return CompareSelect::make(b(i, j), searchValue, kEQ); + return CompareSelect::make(b.load(i, j), searchValue, kEQ); }, {{10, "j"}}); @@ -366,7 +364,7 @@ void testReduceAnyAll() { {{4, "i"}}, allGTSV, [&](const auto& i, const auto& j) { - return CompareSelect::make(b(i, j), searchValue, kGT); + return CompareSelect::make(b.load(i, j), searchValue, kGT); }, {{10, "j"}}); @@ -397,8 +395,8 @@ void testReduceAnyAll() { void testReduceMatmul2D() { KernelScope kernel_scope; - Buffer tA(BufHandle("tA", {3, 2}, kFloat)); - Buffer tB(BufHandle("tB", {2, 3}, kFloat)); + Placeholder tA(BufHandle("tA", {3, 2}, kFloat)); + Placeholder tB(BufHandle("tB", {2, 3}, kFloat)); std::vector tA_(6); std::vector tB_(6); @@ -416,7 +414,7 @@ void testReduceMatmul2D() { {{3, "m"}, {3, "n"}}, Sum(), [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) { - return tA(m, k) * tB(k, n); + return tA.load(m, k) * tB.load(k, n); }, {{2, "k"}}); @@ -439,7 +437,7 @@ void testReduceMatmul2D() { void testReduceRfactorLike() { KernelScope kernel_scope; - Buffer in(BufHandle("in", {10, 10}, kFloat)); + Placeholder in(BufHandle("in", {10, 10}, kFloat)); std::vector in_(100); for 
(int i = 0; i < 100; ++i) { in_[i] = i; @@ -448,7 +446,7 @@ void testReduceRfactorLike() { std::vector out(1, -1.f); Tensor* l1 = Reduce("l1", {{10, "i"}}, Sum(), in, {{10, "j"}}); - Buffer in_rf(BufHandle(l1->func_var())); + Placeholder in_rf(BufHandle(l1->buf())); Tensor* l2 = Reduce("l2", {}, Sum(), in_rf, {{10, "i"}}); @@ -469,15 +467,15 @@ void testReduceAsProducer() { const int M = 10; VarHandle m("m", kInt); - Buffer a(BufHandle("a", {2, 3}, kFloat)); - Buffer b(BufHandle("b", {2, 3, m}, kFloat)); + Placeholder a(BufHandle("a", {2, 3}, kFloat)); + Placeholder b(BufHandle("b", {2, 3, m}, kFloat)); Tensor* c = Reduce("sum", {{2, "l1"}, {3, "n1"}}, Sum(), b, {{m, "m1"}}); Tensor* d = Compute( "scale", {{2, "l2"}, {3, "n1"}}, [&](const VarHandle& l, const VarHandle& n) { - return c->call(l, n) * a(l, n); + return c->call(l, n) * a.load(l, n); }); LoopNest loop({d}); loop.prepareForCodegen(); @@ -513,14 +511,14 @@ void testReduceAsConsumer() { const int M = 10; VarHandle m("m", kInt); - Buffer a(BufHandle("a", {2, 3, m}, kFloat)); - Buffer b(BufHandle("b", {2, 3, m}, kFloat)); + Placeholder a(BufHandle("a", {2, 3, m}, kFloat)); + Placeholder b(BufHandle("b", {2, 3, m}, kFloat)); Tensor* c = Compute( "scale", {{2, "l2"}, {3, "n1"}, {m, "m1"}}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { - return b(l, n, m) * a(l, n, m); + return b.load(l, n, m) * a.load(l, n, m); }); Tensor* d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {m, "m1"}}); LoopNest loop({d}); @@ -559,7 +557,7 @@ void testReduceAsConsumer() { void testSplitReduceAxis() { KernelScope kernel_scope; - Buffer in(BufHandle("in", {16, 8}, kFloat)); + Placeholder in(BufHandle("in", {16, 8}, kFloat)); std::vector in_(16 * 8); for (int i = 0; i < 16; ++i) { @@ -593,7 +591,7 @@ void testSplitReduceAxis() { void testSplitNonReduceAxis() { KernelScope kernel_scope; - Buffer in(BufHandle("in", {16, 8}, kFloat)); + Placeholder in(BufHandle("in", {16, 8}, kFloat)); std::vector in_(16 * 8); for (int i = 0; i < 16; ++i) { @@ -637,7 +635,7 @@ void testReorderedReductionInitializer() { SumOp(c(k, n), 0, a(k, m, n), {m}) */ - Buffer in(BufHandle("in", {1, 12, 6}, kFloat)); + Placeholder in(BufHandle("in", {1, 12, 6}, kFloat)); std::vector in_(12 * 6, 1.f); Tensor* tensor_ = Reduce("sum", {{1, "k"}, {12, "n"}}, Sum(), in, {{6, "m"}}); @@ -685,7 +683,7 @@ void testReduceRfactor() { VarHandle m("m", kInt); VarHandle n("n", kInt); - Buffer b(BufHandle("b", {m, n}, kFloat)); + Placeholder b(BufHandle("b", {m, n}, kFloat)); std::vector in(M * N); for (int j = 0; j < M * N; ++j) { in[j] = j; @@ -720,7 +718,7 @@ void testReduce3DRfactorInternal() { VarHandle n("n", kInt); VarHandle k("k", kInt); - Buffer b(BufHandle("b", {m, n, k}, kFloat)); + Placeholder b(BufHandle("b", {m, n, k}, kFloat)); std::vector in(M * N * K); for (int j = 0; j < M * N * K; ++j) { in[j] = j; @@ -755,7 +753,7 @@ void testReduce3DRfactorInner() { VarHandle n("n", kInt); VarHandle k("k", kInt); - Buffer b(BufHandle("b", {m, n, k}, kFloat)); + Placeholder b(BufHandle("b", {m, n, k}, kFloat)); std::vector in(M * N * K); for (int j = 0; j < M * N * K; ++j) { in[j] = j; @@ -790,7 +788,7 @@ void testReduce3DRfactorOuter() { VarHandle n("n", kInt); VarHandle k("k", kInt); - Buffer b(BufHandle("b", {m, n, k}, kFloat)); + Placeholder b(BufHandle("b", {m, n, k}, kFloat)); std::vector in(M * N * K); for (int j = 0; j < M * N * K; ++j) { in[j] = j; @@ -826,7 +824,7 @@ void testReduce3DRfactorWithOuter() { VarHandle n("n", kInt); VarHandle k("k", kInt); - Buffer 
b(BufHandle("b", {l, m, n, k}, kFloat)); + Placeholder b(BufHandle("b", {l, m, n, k}, kFloat)); std::vector in(L * M * N * K); for (int j = 0; j < M * N * K; ++j) { in[j] = j; @@ -861,7 +859,7 @@ void testReduce3DRfactorRepeated() { VarHandle n("n", kInt); VarHandle k("k", kInt); - Buffer b(BufHandle("b", {m, n, k}, kFloat)); + Placeholder b(BufHandle("b", {m, n, k}, kFloat)); std::vector in(M * N * K); for (int j = 0; j < M * N * K; ++j) { in[j] = j; @@ -907,7 +905,7 @@ void testReduceRfactorInsertionPoint() { VarHandle m("m", kInt); VarHandle n("n", kInt); - Buffer b(BufHandle("b", {m, n}, kFloat)); + Placeholder b(BufHandle("b", {m, n}, kFloat)); std::vector in(M * N); for (int j = 0; j < M * N; ++j) { in[j] = j; @@ -942,7 +940,7 @@ void testReduce3DRfactorInsertionPoint() { VarHandle n("n", kInt); VarHandle k("k", kInt); - Buffer b(BufHandle("b", {m, n, k}, kFloat)); + Placeholder b(BufHandle("b", {m, n, k}, kFloat)); std::vector in(M * N * K); for (int j = 0; j < M * N * K; ++j) { in[j] = j; @@ -969,7 +967,7 @@ void testReduce3DRfactorInsertionPoint() { void testReduceRepeatedInternalRfactor() { KernelScope kernel_scope; - Buffer in_(BufHandle("in_", {2, 3, 4, 5, 6}, kFloat)); + Placeholder in_(BufHandle("in_", {2, 3, 4, 5, 6}, kFloat)); const int InputSize = 2 * 3 * 4 * 5 * 6; std::vector in(InputSize, 1.f); @@ -1020,7 +1018,7 @@ void testReduceSplitTail() { const int N = 10; const int K = 10; - Buffer b(BufHandle("b", {M, N, K}, kFloat)); + Placeholder b(BufHandle("b", {M, N, K}, kFloat)); std::vector in(M * N * K); for (int j = 0; j < M * N * K; ++j) { in[j] = j; @@ -1053,7 +1051,7 @@ void testReduceSplitNoTail() { const int M = 10; const int N = 10; const int K = 10; - Buffer b(BufHandle("b", {M, N, K}, kFloat)); + Placeholder b(BufHandle("b", {M, N, K}, kFloat)); std::vector in(M * N * K); for (int j = 0; j < M * N * K; ++j) { in[j] = j; @@ -1088,7 +1086,7 @@ void testReduceOverSplitTail() { const int N = 10; const int K = 10; - Buffer b(BufHandle("b", {M, N, K}, kFloat)); + Placeholder b(BufHandle("b", {M, N, K}, kFloat)); std::vector in(M * N * K); for (int j = 0; j < M * N * K; ++j) { in[j] = j; @@ -1122,7 +1120,7 @@ void testReduceSplitMask() { const int N = 10; const int K = 10; - Buffer b(BufHandle("b", {M, N, K}, kFloat)); + Placeholder b(BufHandle("b", {M, N, K}, kFloat)); std::vector in(M * N * K); for (int j = 0; j < M * N * K; ++j) { in[j] = j; @@ -1155,7 +1153,7 @@ void testReduceSplitNoMask() { const int M = 10; const int N = 10; const int K = 10; - Buffer b(BufHandle("b", {M, N, K}, kFloat)); + Placeholder b(BufHandle("b", {M, N, K}, kFloat)); std::vector in(M * N * K); for (int j = 0; j < M * N * K; ++j) { in[j] = j; @@ -1189,7 +1187,7 @@ void testReduceOverSplitMask() { const int N = 10; const int K = 10; - Buffer b(BufHandle("b", {M, N, K}, kFloat)); + Placeholder b(BufHandle("b", {M, N, K}, kFloat)); std::vector in(M * N * K); for (int j = 0; j < M * N * K; ++j) { in[j] = j; @@ -1225,7 +1223,7 @@ void testReduceSplitRfactor() { const int K = 10; const int SPLIT_FACTOR = 4; - Buffer b(BufHandle("b", {M, N, K}, kFloat)); + Placeholder b(BufHandle("b", {M, N, K}, kFloat)); std::vector in(M * N * K); for (int m = 0; m < M; ++m) { for (int j = 0; j < N * K; ++j) { @@ -1264,7 +1262,7 @@ void testReduceOverSplitRfactor() { const int K = 10; const int SPLIT_FACTOR = 16; - Buffer b(BufHandle("b", {N, K}, kFloat)); + Placeholder b(BufHandle("b", {N, K}, kFloat)); std::vector in(N * K); for (int j = 0; j < N * K; ++j) { in[j] = j; @@ -1314,12 +1312,12 @@ void 
testReduceInlineReduction() { const int N = 5; const int K = 6; - Buffer a_buf("a", kFloat, {M}); - Buffer b_buf("b", kFloat, {M, N, K}); + Placeholder a_buf("a", kFloat, {M}); + Placeholder b_buf("b", kFloat, {M, N, K}); Tensor* x = Reduce("x", {{M, "m1"}}, Sum(), b_buf, {{N, "n1"}, {K, "k1"}}); Tensor* y = Compute("y", {{M, "m2"}}, [&](const VarHandle& m) { - return a_buf(m) + x->call(m); + return a_buf.load(m) + x->call(m); }); PaddedBuffer a_v(M); @@ -1347,14 +1345,14 @@ void testReduceInlineConsumer() { const int N = 5; const int K = 6; - Buffer a_buf("a", kFloat, {M, N, K}); - Buffer b_buf("b", kFloat, {M, N, K}); + Placeholder a_buf("a", kFloat, {M, N, K}); + Placeholder b_buf("b", kFloat, {M, N, K}); Tensor* x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return a_buf(m, n, k) + b_buf(m, n, k); + return a_buf.load(m, n, k) + b_buf.load(m, n, k); }); Tensor* y = Reduce("y", {{M, "m2"}}, Sum(), x, {{N, "n2"}, {K, "k2"}}); @@ -1401,14 +1399,14 @@ void testReduceInlineReducerInternal() { const int N = 5; const int K = 6; - Buffer a_buf("a", kFloat, {M, N, K}); - Buffer b_buf("b", kFloat, {M, N, K}); + Placeholder a_buf("a", kFloat, {M, N, K}); + Placeholder b_buf("b", kFloat, {M, N, K}); Tensor* x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return a_buf(m, n, k) + b_buf(m, n, k); + return a_buf.load(m, n, k) + b_buf.load(m, n, k); }); Reducer minimum(ExprHandle(0.f), [&](ExprHandle a, ExprHandle b) { diff --git a/test/cpp/tensorexpr/test_registerizer.cpp b/test/cpp/tensorexpr/test_registerizer.cpp index e7a28f1fb277..b286ab7b8151 100644 --- a/test/cpp/tensorexpr/test_registerizer.cpp +++ b/test/cpp/tensorexpr/test_registerizer.cpp @@ -13,7 +13,7 @@ using namespace torch::jit::tensorexpr; // Can replace a simple scalar access with a local variable. void testRegisterizerSimple() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kInt)); + BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); Stmt* stmt = Block::make({Store::make(a, {0}, 0, 1), @@ -58,7 +58,7 @@ void testRegisterizerSimple() { // Won't do replacement of a loop access. void testRegisterizerLoop() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {10}, kInt)); + BufHandle a("A", {10}, kInt); VarHandle x("x", kInt); Stmt* stmt = Block::make({Store::make(a, {0}, 0, 1), @@ -105,7 +105,7 @@ void testRegisterizerLoop() { // invalidate it. void testRegisterizerLoopFixedLoad() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kInt)); + BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); Stmt* stmt = Block::make({Store::make(a, {0}, 0, 1), @@ -151,7 +151,7 @@ void testRegisterizerLoopFixedLoad() { // Will registerize multiple accesses of different items of the same buffer. void testRegisterizerMultiVar() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {2}, kInt)); + BufHandle a("A", {2}, kInt); VarHandle x("x", kInt); Stmt* stmt = Block::make({ Store::make(a, {0}, 0, 1), @@ -207,8 +207,8 @@ void testRegisterizerMultiVar() { // Will registerize the valid accesses while skipping invalid replacements. 
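// Sketch (not from the patch): in these registerizer hunks a bare BufHandle
// replaces Buffer, and accesses stay in the explicit Store::make / Load::make
// form with a mask of 1. This snippet only builds the kind of repeated scalar
// access those tests hand to the pass; running the pass itself is left to the
// surrounding tests.
void sketchRegisterizerInput() {
  KernelScope kernel_scope;
  BufHandle a("A", {1}, kInt);
  VarHandle x("x", kInt);
  Stmt* stmt = Block::make(
      {Store::make(a, {0}, 0, 1),
       For::make(
           x,
           0,
           10,
           Block::make({Store::make(
               a, {0}, Add::make(Load::make(a, {0}, 1), x), 1)}))});
  (void)stmt;
}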
void testRegisterizerVariableLoad() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kInt)); - Buffer b(BufHandle("B", {10}, kInt)); + BufHandle a("A", {1}, kInt); + BufHandle b("B", {10}, kInt); VarHandle x("x", kInt); VarHandle x2("x", kInt); Stmt* stmt = Block::make( @@ -268,7 +268,7 @@ void testRegisterizerSymbolicIndices() { KernelScope kernel_scope; VarHandle i("i", kInt); VarHandle N("N", kInt); - Buffer a(BufHandle("A", {N}, kInt)); + BufHandle a("A", {N}, kInt); VarHandle x("x", kInt); Stmt* stmt = Block::make({Store::make(a, {i}, 0, 1), @@ -317,7 +317,7 @@ void testRegisterizerSymbolicIndices() { // yet. Will have to fix soon though. void testRegisterizerEarlyStop() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kInt)); + BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); Stmt* stmt = Block::make( {Store::make(a, {0}, 0, 1), @@ -344,7 +344,7 @@ void testRegisterizerEarlyStop() { // Can registerize accesses dependent on multiple loop vars. void testRegisterizerMultiLoop() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kInt)); + BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); VarHandle y("y", kInt); Stmt* stmt = Block::make( @@ -402,7 +402,7 @@ void testRegisterizerMultiLoop() { // Can registerize correctly if scalars already exist in the program. void testRegisterizerRepeated() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {2}, kInt)); + BufHandle a("A", {2}, kInt); VarHandle x("x", kInt); Stmt* stmt = Block::make({ Store::make(a, {0}, 0, 1), @@ -458,7 +458,7 @@ void testRegisterizerRepeated() { // Can registerize rthe load of A. void testRegisterizerNoLoads() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kInt)); + BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); Stmt* stmt = Block::make( {Store::make(a, {0}, 0, 1), @@ -499,8 +499,8 @@ void testRegisterizerNoLoads() { // Can registerize the load of A but not the store of B. void testRegisterizerNoRepeatedStores() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kInt)); - Buffer b(BufHandle("B", {10}, kInt)); + BufHandle a("A", {1}, kInt); + BufHandle b("B", {10}, kInt); VarHandle x("x", kInt); Stmt* stmt = Block::make({Store::make(a, {0}, 0, 1), @@ -548,7 +548,7 @@ void testRegisterizerNoRepeatedStores() { // Won't registerize if there are multiple accesses which may overlap. 
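In the test_registerizer.cpp hunks around this point, the tests now build statements directly on a BufHandle instead of wrapping it in a Buffer. A rough sketch of that pattern (sizes assumed; the registerizer run and the FileCheck verification the real tests perform are omitted here):

  KernelScope kernel_scope;
  BufHandle a("A", {1}, kInt);
  VarHandle x("x", kInt);
  // A[0] is initialized and then repeatedly updated at a fixed index,
  // which is the access pattern the registerizer can turn into a scalar.
  Stmt* stmt = Block::make(
      {Store::make(a, {0}, 0, 1),
       For::make(
           x,
           0,
           10,
           Block::make({Store::make(
               a, {0}, Add::make(Load::make(a, {0}, 1), x), 1)}))});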
void testRegisterizerMultiVarOverlap() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {2}, kInt)); + BufHandle a("A", {2}, kInt); VarHandle x("x", kInt); Stmt* stmt = Block::make({ Store::make(a, {0}, 0, 1), @@ -578,12 +578,12 @@ void testRegisterizerMultiVarOverlap() { void testRegisterizerAllocs() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {2}, kInt)); - Buffer b(BufHandle("B", {1}, kInt)); - Buffer c(BufHandle("C", {1}, kInt)); + BufHandle a("A", {2}, kInt); + BufHandle b("B", {1}, kInt); + BufHandle c("C", {1}, kInt); VarHandle x("x", kInt); - VarHandle b_(b.data()->base_handle()); + VarHandle b_(b.node()->base_handle()); Stmt* stmt = Block::make( {Allocate::make(b_, kInt, {Load::make(c, {0}, 1)}), @@ -646,7 +646,7 @@ void testRegisterizerAllocs() { void testRegisterizerNoInitializer() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kInt)); + BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); Stmt* stmt = Block::make({For::make( x, @@ -687,8 +687,8 @@ void testRegisterizerNoInitializer() { void testRegisterizerLoadThenStore() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kInt)); - Buffer b(BufHandle("B", {1}, kInt)); + BufHandle a("A", {1}, kInt); + BufHandle b("B", {1}, kInt); VarHandle x("x", kInt); Stmt* stmt = Block::make({For::make( x, @@ -737,7 +737,7 @@ void testRegisterizerLoadThenStore() { void testRegisterizerParallelized() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kInt)); + BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); LoopOptions loopOpts; loopOpts.set_gpu_block_index(0); @@ -765,7 +765,7 @@ void testRegisterizerParallelized() { void testRegisterizerConditions() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {5}, kInt)); + BufHandle a("A", {5}, kInt); VarHandle x("x", kInt); Stmt* stmt = Block::make({For::make( x, diff --git a/test/cpp/tensorexpr/test_simplify.cpp b/test/cpp/tensorexpr/test_simplify.cpp index b88aa17efd3e..22cd89a33c30 100644 --- a/test/cpp/tensorexpr/test_simplify.cpp +++ b/test/cpp/tensorexpr/test_simplify.cpp @@ -504,9 +504,9 @@ void testHashDifferenceTypes() { void testHashLargeExpression() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); - Buffer c(BufHandle("C", {N}, kInt)); + BufHandle a("A", {N}, kInt); + BufHandle b("B", {N}, kInt); + BufHandle c("C", {N}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); auto memcpy_stmt = For::make( @@ -522,8 +522,8 @@ void testHashLargeExpression() { CompareSelectOperation::kEQ), mask)); - Buffer d(BufHandle("D", {1}, kInt)); - Buffer e(BufHandle("E", {1}, kInt)); + BufHandle d("D", {1}, kInt); + BufHandle e("E", {1}, kInt); auto store_ramp_stmt = Store::make( e, {Ramp::make(0, 1, 4)}, @@ -555,9 +555,9 @@ void testHashLargeExpression() { void testHashForLoopOptions() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); - Buffer c(BufHandle("C", {N}, kInt)); + BufHandle a("A", {N}, kInt); + BufHandle b("B", {N}, kInt); + BufHandle c("C", {N}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); auto for_stmt = For::make( @@ -2632,8 +2632,8 @@ void testSimplifyConstantCond() { { // If the condition is constant true then take the true_value. // 1 ? 
A[0] = 1 : B[0] = 1 => A[0] = 1 - Buffer a(BufHandle("A", {1}, kInt)); - Buffer b(BufHandle("B", {1}, kInt)); + BufHandle a("A", {1}, kInt); + BufHandle b("B", {1}, kInt); ExprHandle condition(1); Stmt* true_val = Store::make(a, {0}, 1, 1); Stmt* false_val = Store::make(b, {0}, 1, 1); @@ -2648,8 +2648,8 @@ void testSimplifyConstantCond() { { // If the condition is constant false then take the false_value. // 0 ? A[0] = 1 : B[0] = 1 => B[0] = 1 - Buffer a(BufHandle("A", {1}, kInt)); - Buffer b(BufHandle("B", {1}, kInt)); + BufHandle a("A", {1}, kInt); + BufHandle b("B", {1}, kInt); ExprHandle condition(0); Stmt* true_val = Store::make(a, {0}, 1, 1); Stmt* false_val = Store::make(b, {0}, 1, 1); @@ -2665,8 +2665,8 @@ void testSimplifyConstantCond() { // condition is simplified before checking. // (x-x) ? A[0] = 1 : B[0] = 1 => B[0] = 1 VarHandle x("x", kInt); - Buffer a(BufHandle("A", {1}, kInt)); - Buffer b(BufHandle("B", {1}, kInt)); + BufHandle a("A", {1}, kInt); + BufHandle b("B", {1}, kInt); ExprHandle condition(x - x); Stmt* true_val = Store::make(a, {0}, 1, 1); Stmt* false_val = Store::make(b, {0}, 1, 1); @@ -2682,7 +2682,7 @@ void testSimplifyConstantCond() { // If both branches are the same then don't do the condition. // x ? A[0] = x : A[0] = x => A[0] = x VarHandle x("x", kInt); - Buffer a(BufHandle("A", {1}, kInt)); + BufHandle a("A", {1}, kInt); ExprHandle condition(x - x); Stmt* true_val = Store::make(a, {0}, x, 1); Stmt* false_val = Store::make(a, {0}, x, 1); @@ -2698,7 +2698,7 @@ void testSimplifyConstantCond() { // If both branches simplify to the same thing it still works. // x ? (x + x) : (2 * x) => x VarHandle x("x", kInt); - Buffer a(BufHandle("A", {1}, kInt)); + BufHandle a("A", {1}, kInt); ExprHandle condition(x - x); Stmt* true_val = Store::make(a, {0}, ExprHandle(2) * x, 1); Stmt* false_val = Store::make(a, {0}, x + x, 1); @@ -2714,7 +2714,7 @@ void testSimplifyConstantCond() { // But not if they dont // x ? x : (2 * x) => x ? x : (2 * x) VarHandle x("x", kInt); - Buffer a(BufHandle("A", {1}, kInt)); + BufHandle a("A", {1}, kInt); ExprHandle condition(x); Stmt* true_val = Store::make(a, {0}, x, 1); Stmt* false_val = Store::make(a, {0}, ExprHandle(2) * x, 1); @@ -2771,8 +2771,8 @@ void testSimplifyEliminateZeroLengthFor() { { // Will eliminate zero loop For. - Buffer a(BufHandle("A", {4}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); auto body = @@ -2784,8 +2784,8 @@ void testSimplifyEliminateZeroLengthFor() { { // still works if start is not zero. - Buffer a(BufHandle("A", {4}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); auto body = @@ -2798,8 +2798,8 @@ void testSimplifyEliminateZeroLengthFor() { { // works if both terms are variable. VarHandle x("x", kInt); - Buffer a(BufHandle("A", {4}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); auto body = @@ -2812,8 +2812,8 @@ void testSimplifyEliminateZeroLengthFor() { { // works if one term simplifies down. 
VarHandle x("x", kInt); - Buffer a(BufHandle("A", {4}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); auto body = For::make( @@ -2825,8 +2825,8 @@ void testSimplifyEliminateZeroLengthFor() { { // Sanity check does nothing if the condition is not met. - Buffer a(BufHandle("A", {4}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); auto body = @@ -2841,8 +2841,8 @@ void testSimplifyOneLoopFor() { { // Will remove the loop if the body is run once. - Buffer a(BufHandle("A", {4}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); auto body = @@ -2856,8 +2856,8 @@ void testSimplifyOneLoopFor() { { // still works if start is not zero. - Buffer a(BufHandle("A", {4}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); auto body = @@ -2872,8 +2872,8 @@ void testSimplifyOneLoopFor() { { // works if both terms are variable. VarHandle x("x", kInt); - Buffer a(BufHandle("A", {4}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); auto body = For::make( @@ -2888,8 +2888,8 @@ void testSimplifyOneLoopFor() { { // works if one term simplifies down. VarHandle x("x", kInt); - Buffer a(BufHandle("A", {4}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); auto body = For::make( @@ -2903,8 +2903,8 @@ void testSimplifyOneLoopFor() { { // Sanity check does nothing if the condition is not met. - Buffer a(BufHandle("A", {4}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); auto body = @@ -2919,8 +2919,8 @@ void testSimplifyForWontLoseLoopOptions() { { // Sanity check does nothing if the condition is not met. - Buffer a(BufHandle("A", {4}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); LoopOptions options; @@ -2939,8 +2939,8 @@ void testSimplifyMultilevelFor() { { // Multiple layers of For will be simplified out. - Buffer a(BufHandle("A", {4}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); VarHandle j("j", kInt); @@ -2956,8 +2956,8 @@ void testSimplifyMultilevelFor() { { // Will maintain an outer loop if the inner loop is eliminated. - Buffer a(BufHandle("A", {4}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); VarHandle j("j", kInt); @@ -2979,8 +2979,8 @@ void testSimplifyMultilevelFor() { { // Will maintain inner loop if outer loops is eliminated. 
- Buffer a(BufHandle("A", {4}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); VarHandle j("j", kInt); @@ -3003,7 +3003,7 @@ void testSimplifyForCleansUp() { KernelScope kernel_scope; { - Buffer a("a", kFloat, {1, 12, 1}); + Placeholder a("a", kFloat, {1, 12, 1}); VarHandle x("x", kInt); Tensor* b = Compute( "x", @@ -3051,7 +3051,7 @@ void testSimplifyFlattenBlock() { { // Flatten multiple blocks down to one. // { { { stmt1, stmt2 } } } => { stmt1, stmt2 } - Buffer a(BufHandle("A", {1}, kInt)); + BufHandle a("A", {1}, kInt); Store* store1 = Store::make(a, {0}, 1, 1); Store* store2 = Store::make(a, {0}, 0, 1); @@ -3074,7 +3074,7 @@ void testSimplifyFlattenBlock() { { // Flatten multiple sub blocks containing statements. // { { stmt1 }, { stmt2 } } => { stmt1, stmt2 } - Buffer a(BufHandle("A", {1}, kInt)); + BufHandle a("A", {1}, kInt); Store* store1 = Store::make(a, {0}, 1, 1); Store* store2 = Store::make(a, {0}, 0, 1); @@ -3097,7 +3097,7 @@ void testSimplifyFlattenBlock() { { // Flatten sub blocks with different depths. // { stmt1 , { { stmt2 } } } => { stmt1, stmt2 } - Buffer a(BufHandle("A", {1}, kInt)); + BufHandle a("A", {1}, kInt); Store* store1 = Store::make(a, {0}, 1, 1); Store* store2 = Store::make(a, {0}, 0, 1); @@ -3240,9 +3240,9 @@ void testDontSimplifyRand() { void testSimplifyReorderForCond() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {4}, kInt)); - Buffer b(BufHandle("B", {1}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle b("B", {1}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); VarHandle j("j", kInt); @@ -3440,8 +3440,8 @@ void testSimplifyReorderForCond() { void testSimplifyFuseConditions() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {2}, kInt)); - Buffer b(BufHandle("B", {2}, kInt)); + BufHandle a("A", {2}, kInt); + BufHandle b("B", {2}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); VarHandle j("j", kInt); @@ -3858,7 +3858,7 @@ void testSimplifyFuseConditions() { void testSimplifySyncThreads() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {4}, kInt)); + BufHandle a("A", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); @@ -3950,5 +3950,45 @@ void testSimplifySyncThreads() { } } +void testSimplifyRampSubBroadcast() { + KernelScope kernel_scope; + int num_lanes = 4; + ExprHandle ramp = Ramp::make(ExprHandle(0), ExprHandle(6), num_lanes); + ExprHandle broadcast = Broadcast::make(ExprHandle(-5), num_lanes); + ExprHandle simplified = IRSimplifier::simplify(ramp - broadcast); + Ramp* newRamp = simplified.AsNode(); + IS_NODE_WITH_NAME(IntImm, newRamp->base(), base); + ASSERT_EQ(base->value(), 5); + IS_NODE_WITH_NAME(IntImm, newRamp->stride(), stride); + ASSERT_EQ(stride->value(), 6); + ASSERT_EQ(newRamp->lanes(), num_lanes); +} + +void testSimplifyBroadcastTermExpander() { + KernelScope kernel_scope; + int num_lanes = 8; + ExprHandle bc0 = Broadcast::make(ExprHandle(0), num_lanes); + ExprHandle bc1 = Broadcast::make(ExprHandle(1), num_lanes); + ExprHandle bc2 = Broadcast::make(ExprHandle(2), num_lanes); + // NB: We need a term in the middle which isn't simplified to trigger the + // relevant path in TermExpander::mutate. The two bc1 terms are brought + // together and simplified to 2 * bc1, which then needs to make 2 multi-lane. 
+ ExprHandle simplified = IRSimplifier::simplify(bc1 + (bc0 / bc2) + bc1); + BufHandle buf("buf", {num_lanes}, kInt); + // The result isn't fully simplified currently and thus would be brittle to + // match. Observe its value instead. + auto store = Store::make( + buf, + {Ramp::make(0, 1, num_lanes)}, + simplified, + Broadcast::make(ExprHandle(1), num_lanes)); + SimpleIREvaluator eval(store, buf); + std::vector output(num_lanes); + eval(output); + for (int i = 0; i < num_lanes; ++i) { + ASSERT_EQ(output[i], 2); + } +} + } // namespace jit } // namespace torch diff --git a/test/cpp/tensorexpr/test_te_fuser_pass.cpp b/test/cpp/tensorexpr/test_te_fuser_pass.cpp index 680311685375..826cf7209346 100644 --- a/test/cpp/tensorexpr/test_te_fuser_pass.cpp +++ b/test/cpp/tensorexpr/test_te_fuser_pass.cpp @@ -151,7 +151,7 @@ void testFuserPass_UnknownShapes() { %y : Tensor): %a : Tensor = aten::mul(%x, %y) %b : Tensor = aten::mul(%x, %a) - return (%a))IR"; + return (%b))IR"; auto g = std::make_shared(); torch::jit::parseIR(graph_string, g.get()); @@ -311,5 +311,24 @@ void testFuserPass_MergeGroups() { ->run(*g); } +void testFuserPass_UnknownShapesIgnored() { + WithCPUFuser cf; + KernelScope kernel_scope; + const auto graph_string = R"IR( + graph(%x : Float(device=cpu), + %y : Float(device=cpu)): + %a : Float(device=cpu) = aten::mul(%x, %y) + %b : Float(device=cpu) = aten::mul(%x, %a) + return (%b))IR"; + auto g = std::make_shared(); + torch::jit::parseIR(graph_string, g.get()); + + g->lint(); + FuseTensorExprs(g, /* min_group_size= */ 2, /* disable_shape_checks= */ true); + + // Test that we are generating fusion groups even though shapes are not known + testing::FileCheck().check("prim::TensorExprGroup")->run(*g); +} + } // namespace jit } // namespace torch diff --git a/test/cpp/tensorexpr/test_train.cpp b/test/cpp/tensorexpr/test_train.cpp index aa2426050324..755d482dc2b4 100644 --- a/test/cpp/tensorexpr/test_train.cpp +++ b/test/cpp/tensorexpr/test_train.cpp @@ -2,9 +2,7 @@ #include "test/cpp/tensorexpr/padded_buffer.h" #include "test/cpp/tensorexpr/test_base.h" #include "test/cpp/tensorexpr/test_utils.h" -#include "torch/csrc/jit/tensorexpr/buffer.h" #include "torch/csrc/jit/tensorexpr/eval.h" -#include "torch/csrc/jit/tensorexpr/function.h" #include "torch/csrc/jit/tensorexpr/ir.h" #include "torch/csrc/jit/tensorexpr/ir_printer.h" #include "torch/csrc/jit/tensorexpr/loopnest.h" @@ -56,7 +54,7 @@ void testTrainBasic() { auto C = call("mul", {A, B})[0]; Stmt* s; - std::map inputs; + std::map inputs; std::map bindings; std::map vbindings; @@ -85,7 +83,7 @@ void testTrainBasic() { auto dA = grad(D, A, ones); Stmt* s; - std::map inputs; + std::map inputs; std::map bindings; std::map vbindings; @@ -117,7 +115,7 @@ void testTrainBasic() { auto C = A + B; Stmt* s; - std::map inputs; + std::map inputs; std::map bindings; std::map vbindings; @@ -146,7 +144,7 @@ void testTrainBasic() { auto dA = D.grad(A, ones); Stmt* s; - std::map inputs; + std::map inputs; std::map bindings; std::map vbindings; @@ -181,7 +179,7 @@ void testTrainBasic() { auto dC = (C * C).grad(B, ones); Stmt* s; - std::map inputs; + std::map inputs; std::map bindings; std::map vbindings; @@ -209,7 +207,7 @@ void testTrainBasic() { auto X = T(g, {"K"}); auto Y = X.sum(); Stmt* s; - std::map inputs; + std::map inputs; std::map bindings; std::map vbindings; @@ -229,7 +227,7 @@ void testTrainBasic() { auto Y = X.sum(); auto Z = Y.broadcast_like(X); Stmt* s; - std::map inputs; + std::map inputs; std::map bindings; std::map vbindings; @@ 
-266,7 +264,7 @@ void testTrainBasic() { auto new_W = W - W_grad; Stmt* s; - std::map inputs; + std::map inputs; std::map bindings; std::map vbindings; diff --git a/test/cpp/tensorexpr/test_train.h b/test/cpp/tensorexpr/test_train.h index 39674933aa9c..16ff667860d0 100644 --- a/test/cpp/tensorexpr/test_train.h +++ b/test/cpp/tensorexpr/test_train.h @@ -37,7 +37,7 @@ VTensor* grad(VTensor* y, VTensor* x, VTensor* j); std::string dot(const VGraph& g); std::tuple< torch::jit::tensorexpr::Stmt*, - std::map, + std::map, std::map, std::map> to_tensorexpr(const VGraph& graph, std::vector outputs = {}); diff --git a/test/cpp/tensorexpr/test_train_impl.cpp b/test/cpp/tensorexpr/test_train_impl.cpp index 1636b583cef9..b9b7d33b129b 100644 --- a/test/cpp/tensorexpr/test_train_impl.cpp +++ b/test/cpp/tensorexpr/test_train_impl.cpp @@ -1,8 +1,6 @@ #include "test/cpp/tensorexpr/test_train.h" #include "test/cpp/tensorexpr/test_utils.h" -#include "torch/csrc/jit/tensorexpr/buffer.h" #include "torch/csrc/jit/tensorexpr/eval.h" -#include "torch/csrc/jit/tensorexpr/function.h" #include "torch/csrc/jit/tensorexpr/ir.h" #include "torch/csrc/jit/tensorexpr/ir_printer.h" #include "torch/csrc/jit/tensorexpr/loopnest.h" @@ -408,7 +406,7 @@ std::string dot(const VGraph& g) { std::tuple< Stmt*, - std::map, + std::map, std::map, std::map> to_tensorexpr(const VGraph& graph, std::vector outputs) { @@ -458,7 +456,7 @@ to_tensorexpr(const VGraph& graph, std::vector outputs) { return order; }; - std::map inputs; + std::map inputs; std::map bindings; std::map vbindings; @@ -481,10 +479,10 @@ to_tensorexpr(const VGraph& graph, std::vector outputs) { if (vars.size() == 0) { vars.emplace_back(IntImm::make(1)); } - Buffer inpB(BufHandle(get_name(id), exprs, kFloat)); + Placeholder inpB(BufHandle(get_name(id), exprs, kFloat)); auto inpT = Compute("input" + get_name(id), vars, [&](const VarHandle& i) { - return Load::make(inpB, {i}, 1); + return Load::make(BufHandle(inpB.data()), {i}, 1); }); inputs.emplace(&t, inpB); bindings.emplace(&t, inpT); diff --git a/test/cpp/tensorexpr/tests.h b/test/cpp/tensorexpr/tests.h index 20206a348d25..dc21373f241f 100644 --- a/test/cpp/tensorexpr/tests.h +++ b/test/cpp/tensorexpr/tests.h @@ -23,6 +23,7 @@ namespace jit { _(ExprLongTest) \ _(ExprHalfTest) \ _(ExprDoubleTest) \ + _(ExprDisallowBoolArithmetic) \ _(ExprVectorAdd01) \ _(ExprCompareSelectEQ) \ _(ExprCompareSelectDtypes) \ @@ -55,6 +56,7 @@ namespace jit { _(ExprSplitWithTail) \ _(ExprSplitWithTailNone) \ _(ExprSplitWithMask01) \ + _(ExprSplitWithMaskRepeatedNoMask) \ _(SplitWithTailWithLoopOptions) \ _(SplitWithMaskWithLoopOptions) \ _(ScheduleBroadcastAddBuffer) \ @@ -216,6 +218,8 @@ namespace jit { _(SimplifyReorderForCond) \ _(SimplifyFuseConditions) \ _(SimplifySyncThreads) \ + _(SimplifyRampSubBroadcast) \ + _(SimplifyBroadcastTermExpander) \ _(RegisterizerSimple) \ _(RegisterizerLoop) \ _(RegisterizerLoopFixedLoad) \ @@ -291,6 +295,7 @@ namespace jit { _(FuserPass_0DimInput) \ _(FuserPass_UnfusibleDevice) \ _(FuserPass_UnknownShapes) \ + _(FuserPass_UnknownShapesIgnored) \ _(FuserPass_Multidevice) \ _(FuserPass_MergeGroups) \ _(TrainBasic) @@ -440,6 +445,7 @@ namespace jit { _(CudaSigmoid) \ _(CudaHalfCast) \ _(CudaHalfSupport) \ + _(CudaHalfPropagation) \ _(CudaPrioritizeDependents) \ _(CudaMaskBlockDim) \ _(CudaMaskThreadDim) \ diff --git a/test/cpp/tensorexpr/tutorial.cpp b/test/cpp/tensorexpr/tutorial.cpp new file mode 100644 index 000000000000..f0bcfc4c2485 --- /dev/null +++ b/test/cpp/tensorexpr/tutorial.cpp @@ -0,0 +1,426 
@@ +// *** Tensor Expressions *** +// +// This tutorial covers basics of NNC's tensor expressions, shows basic APIs to +// work with them, and outlines how they are used in the overall TorchScript +// compilation pipeline. This doc is permanently a "work in progress" since NNC +// is under active development and things change fast. +// +// This tutorial's code is compiled in the standard pytorch build, and the +// executable can be found in `build/bin/tutorial_tensorexpr`. +// +// *** What is NNC *** +// +// NNC stands for Neural Net Compiler. It is a component of TorchScript JIT +// and it performs on-the-fly code generation for kernels, which are often a +// combination of multiple aten (torch) operators. +// +// When the JIT interpreter executes a torchscript model, it automatically +// extracts subgraphs from the torchscript IR graph for which specialized code +// can be JIT generated. This usually improves performance as the 'combined' +// kernel created from the subgraph could avoid unnecessary memory traffic that +// is unavoidable when the subgraph is interpreted as-is, operator by operator. +// This optimization is often referred to as 'fusion'. Relatedly, the process of +// finding and extracting subgraphs suitable for NNC code generation is done by +// a JIT pass called 'fuser'. +// +// *** What is TE *** +// +// TE stands for Tensor Expressions. TE is a commonly used approach for +// compiling kernels performing tensor (~matrix) computation. The idea behind it +// is that operators are represented as a mathematical formula describing what +// computation they do (as TEs) and then the TE engine can perform mathematical +// simplification and other optimizations using those formulas and eventually +// generate executable code that would produce the same results as the original +// sequence of operators, but more efficiently. +// +// NNC's design and implementation of TE was heavily inspired by the Halide and TVM +// projects. +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace torch::jit::tensorexpr; + +int main(int argc, char* argv[]) { + // Memory management for tensor expressions is currently done with memory + // arenas. That is, whenever an object is created it registers itself in an + // arena and the object is kept alive as long as the arena is alive. When the + // arena gets destructed, it deletes all objects registered in it. + // + // The easiest way to set up a memory arena is to use the `KernelScope` class - it + // is a resource guard that creates a new arena on construction and restores + // the previously set arena on destruction. + // + // We will create a kernel scope here, and thus we'll set up a mem arena for + // the entire tutorial. + KernelScope kernel_scope; + + std::cout << "*** Structure of tensor expressions ***" << std::endl; + { + // A tensor expression is a tree of expressions. Each expression has a type, + // and that type defines what sub-expressions the current expression has. + // For instance, an expression of type 'Mul' would have type 'kMul' and + // two subexpressions: LHS and RHS. Each of these two sub-expressions could + // also be a 'Mul' or some other expression.
+ // + // Let's construct a simple TE: + Expr* lhs = new IntImm(5); + Expr* rhs = new Var("x", kInt); + Expr* mul = new Mul(lhs, rhs); + std::cout << "Tensor expression: " << *mul << std::endl; + // Prints: Tensor expression: 5 * x + + // Here we created an expression representing a 5*x computation, where x is + // an int variable. + + // Another, probably a more convenient, way to construct tensor expressions + // is to use so-called expression handles (as opposed to raw expressions + // like we did in the previous example). Expression handles overload common + // operations and allow us to express the same semantics in a more natural + // way: + ExprHandle l = 1; + ExprHandle r = Var::make("x", kInt); + ExprHandle m = l * r; + std::cout << "Tensor expression: " << *m.node() << std::endl; + // Prints: Tensor expression: 1 * x + + // In a similar fashion we could construct arbitrarily complex expressions + // using mathematical and logical operations, casts between various data + // types, and a bunch of intrinsics. + ExprHandle a = Var::make("a", kInt); + ExprHandle b = Var::make("b", kFloat); + ExprHandle c = Var::make("c", kFloat); + ExprHandle x = ExprHandle(5) * a + b / (sigmoid(c) - 3.0f); + std::cout << "Tensor expression: " << *x.node() << std::endl; + // Prints: Tensor expression: float(5 * a) + b / ((sigmoid(c)) - 3.f) + + // The ultimate purpose of tensor expressions is to optimize tensor + // computations, and in order to represent accesses to tensor data, there + // is a special kind of expression - a load. + // To construct a load we need two pieces: the base and the indices. The + // base of a load is a Buf expression, which could be thought of as a + // placeholder similar to Var, but with dimensions info. + // + // Let's construct a simple load: + BufHandle A("A", {ExprHandle(64), ExprHandle(32)}, kInt); + ExprHandle i = Var::make("i", kInt), j = Var::make("j", kInt); + ExprHandle load = Load::make(A.dtype(), A, {i, j}, /* mask= */ 1); + std::cout << "Tensor expression: " << *load.node() << std::endl; + // Prints: Tensor expression: A[i, j] + } + + std::cout << "*** Tensors, Functions, and Placeholders ***" << std::endl; + { + // A tensor computation is represented by objects of the Tensor class and + // consists of the following pieces: + // - domain, which is specified by a Buf expression + // - an expression (or several expressions if we want to perform several + // independent computations over the same domain) for its elements, as a + // function of indices + // + // We use Function objects to represent this. Let's build one. + // + // First, we need to specify the domain, or dimensions in which the + // computation would be performed. Let's create a 64x32 domain: + std::vector<const Expr*> dims = { + new IntImm(64), new IntImm(32)}; // IntImm stands for Integer Immediate + // and represents an integer constant + + // Next we need to create Function arguments. The arguments of a Function + // are Vars, and they play the role of placeholders. The computation that the + // function describes will use these arguments. + const Var* i = new Var("i", kInt); + const Var* j = new Var("j", kInt); + std::vector<const Var*> args = {i, j}; + + // Now we can define the function computations using these arguments. Let's + // create two computations: the first would multiply the arguments of the + // function, the second would add them.
+ Expr* func_body1 = new Mul(i, j); + Expr* func_body2 = new Add(i, j); + + // Finally, we pass all these pieces together to the Function constructor: + Function* func = + new Function({"X", "Y"}, dims, args, {func_body1, func_body2}); + // Under the hood the Function constructor creates separate `Buf` + // expressions for each computation (which can be accessed via + // `func->func_var(idx)`) with the names specified by the first parameter of + // the constructor call. In our example two `Buf` variables will be created + // with names 'X' and 'Y', each of them would signify a domain of 64x32. + + // We can now print out our function: + std::cout << "Tensor function: " << *func << std::endl; + // Prints: + // Tensor function: Function F(i[64], j[32]) { + // X = i * j + // Y = i + j + // } + + // A Tensor refers to an individual computation defined by a Function. For + // instance, we could create the following tensor given the function above: + int output_idx = 0; // Used to index the computation + Tensor* X = new Tensor(func, output_idx); + std::cout << "Tensor computation: " << *X << std::endl; + // Prints: Tensor computation: Tensor X(i[64], j[32]) = i * j + + // Similarly to how we provide a more convenient way of using handles for + // constructing Exprs, Tensors also have a more convenient API for + // construction. It is based on Compute functions, which take a name, + // dimensions, and a lambda specifying the computation body: + Tensor* Z = Compute( + "Z", + {{64, "i"}, {32, "j"}}, + [](const VarHandle& i, const VarHandle& j) { return i / j; }); + std::cout << "Tensor computation: " << *Z << std::endl; + // Prints: Tensor computation: Tensor Z(i[64], j[32]) = i / j + + // Tensors might access other tensors and external placeholders in their + // expressions. It can be done like so: + Placeholder P("P", kFloat, {64, 32}); + Tensor* R = Compute( + "R", + {{64, "i"}, {32, "j"}}, + [&](const VarHandle& i, const VarHandle& j) { + return Z->call(i, j) * P.load(i, j); + }); + std::cout << "Tensor computation: " << *R << std::endl; + // Prints: Tensor computation: Tensor R(i[64], j[32]) = Z(i, j) * P[i, j] + + // Placeholders could be thought of as external tensors, i.e. tensors for + // which we don't have the element expression. In other words, for `Tensor` + // we know an expression specifying how its elements can be computed (a + // mathematical formula). For external tensors, or placeholders, we don't + // have such an expression. They need to be considered as coming to us as + // inputs from outside - we can only load data from them. + // + // Also note that we use 'call' to construct an access to an element of a + // Tensor and we use 'load' for accessing elements of an external tensor + // through its Placeholder. This is an implementation detail and could be + // changed in the future. + // + // Why do we have Functions and Tensors and what is the relationship between + // them? Functions are used to represent several computations performed over + // the same domain. Tensors refer to individual computations of a Function. + // + // Also note that currently a lot of code only supports single-output + // Functions, in which case they become almost identical to Tensors. This + // will probably be changed in the future. + + // TODO: Show how reductions are represented and constructed + } + + std::cout << "*** Loopnests and Statements ***" << std::endl; + { + // Creating a tensor expression is the first step to generate executable + // code for it.
The next step is to represent it as a loop nest and apply + // various loop transformations in order to get an optimal implementation. + // In Halide's or TVM's terms the first step was to define the algorithm of + // computation (what to compute?) and now we are getting to the schedule of + // the computation (how to compute?). + // + // Let's create a simple tensor expression and construct a loop nest for it. + Placeholder A("A", kFloat, {64, 32}); + Placeholder B("B", kFloat, {64, 32}); + Tensor* X = Compute( + "X", + {{64, "i"}, {32, "j"}}, + [&](const VarHandle& i, const VarHandle& j) { + return A.load(i, j) + B.load(i, j); + }); + Tensor* Y = Compute( + "Y", + {{64, "i"}, {32, "j"}}, + [&](const VarHandle& i, const VarHandle& j) { + return sigmoid(X->call(i, j)); + }); + std::cout << "Tensor computation X: " << *X + << "Tensor computation Y: " << *Y << std::endl; + // Prints: + // Tensor computation X: Tensor X(i[64], j[32]) = (A[i, j]) + (B[i, j]) + // Tensor computation Y: Tensor Y(i[64], j[32]) = sigmoid(X(i, j)) + + // Creating a loop nest is quite simple: we just need to specify + // the output tensors of our computation, and the LoopNest object will + // automatically pull in all tensor dependencies: + LoopNest loopnest({Y}); + + // The IR used in LoopNest is based on tensor statements, represented by + // the `Stmt` class. Statements are used to specify the loop nest structure, and + // to take a sneak peek at them, let's print out what we got right after + // creating our LoopNest object: + std::cout << *loopnest.root_stmt() << std::endl; + // Prints: + // { + // for (int i = 0; i < 64; i++) { + // for (int j = 0; j < 32; j++) { + // X[i, j] = (A[i, j]) + (B[i, j]); + // } + // } + // for (int i_1 = 0; i_1 < 64; i_1++) { + // for (int j_1 = 0; j_1 < 32; j_1++) { + // Y[i_1, j_1] = sigmoid(X(i_1, j_1)); + // } + // } + // } + + // To introduce statements let's first look at their three main types (in + // fact, there are more than 3 types, but the other types would be easy to + // understand once the overall structure is clear): + // 1) Block + // 2) For + // 3) Store + // + // A `Block` statement is simply a list of other statements. + // A `For` is a statement representing one axis of computation. It contains + // an index variable (Var), boundaries of the axis (start and end - both are + // `Expr`s), and a `Block` statement body. + // A `Store` represents an assignment to a tensor element. It contains a Buf + // representing the target tensor, a list of expressions for indices of the + // element, and the value to be stored, which is an arbitrary expression. + + // Once we've constructed the loop nest, we can apply various transformations + // to it. To begin with, let's inline the computation of X into the computation of Y + // and see what happens to our statements. + loopnest.computeInline(loopnest.getLoopBodyFor(X)); + std::cout << *loopnest.root_stmt() << std::endl; + // Prints: + // { + // for (int i = 0; i < 64; i++) { + // for (int j = 0; j < 32; j++) { + // Y[i, j] = sigmoid((A[i, j]) + (B[i, j])); + // } + // } + // } + // + // As you can see, the first two loops have disappeared and the expression + // for X[i,j] has been inserted into the Y[i,j] computation. + + // Loop transformations can be composed, so we can do something else with + // our loop nest now. Let's split the inner loop with a factor of 9, for + // instance.
+ std::vector<For*> loops = loopnest.getLoopStmtsFor(Y); + For* j_outer; + For* j_inner; + For* j_tail; + int split_factor = 9; + loopnest.splitWithTail( + loops[1], // loops[0] is the outer loop, loops[1] is inner + split_factor, + &j_outer, // These are handles that we would be using for + &j_inner, // further transformations + &j_tail); + std::cout << *loopnest.root_stmt() << std::endl; + // Prints: + // { + // for (int i = 0; i < 64; i++) { + // for (int j_outer = 0; j_outer < (32 - 0) / 9; j_outer++) { + // for (int j_inner = 0; j_inner < 9; j_inner++) { + // Y[i, j_outer * 9 + j_inner] = sigmoid((A[i, j_outer * 9 + ... + // } + // } + // for (int j_tail = 0; j_tail < (32 - 0) % 9; j_tail++) { + // Y[i, j_tail + ((32 - 0) / 9) * 9] = sigmoid((A[i, j_tail + ... + // } + // } + // } + + // TODO: List all available transformations + // TODO: Show how statements can be constructed manually + } + + std::cout << "*** Codegen ***" << std::endl; + { + // The ultimate goal of tensor expressions is to provide a mechanism to + // execute a given computation in the fastest possible way. So far we've + // looked at how we could describe what computation we're interested in, but + // we haven't looked at how to actually execute it. So far all we've been + // dealing with are just symbols with no actual data associated; in this + // section we will look at how to bridge that gap. + + // Let's start by constructing a simple computation for us to work with: + Placeholder A("A", kInt, {64, 32}); + Placeholder B("B", kInt, {64, 32}); + Tensor* X = Compute( + "X", + {{64, "i"}, {32, "j"}}, + [&](const VarHandle& i, const VarHandle& j) { + return A.load(i, j) + B.load(i, j); + }); + + // And let's lower it to a loop nest, as we did in the previous section: + LoopNest loopnest({X}); + std::cout << *loopnest.root_stmt() << std::endl; + // Prints: + // { + // for (int i = 0; i < 64; i++) { + // for (int j = 0; j < 32; j++) { + // X[i, j] = (A[i, j]) + (B[i, j]); + // } + // } + + // Now imagine that we have two actual 64x32 tensors that we want to sum + // together: how do we pass those tensors to the computation, and how do we + // carry it out? + // + // A Codegen object is aimed at providing exactly that functionality. Codegen + // is an abstract class and concrete codegens are derived from it. + // Currently, we have three codegens: + // 1) Simple Evaluator, + // 2) LLVM Codegen for CPU, + // 3) CUDA Codegen. + // In this example we will be using Simple Evaluator, since it's available + // everywhere. + + // To create a codegen, we need to provide the statement - it specifies the + // computation we want to perform - and a list of placeholders and tensors + // used in the computation. The latter part is crucial, since that's the only + // way the codegen can correlate symbols in the statement with the actual + // data arrays that we will pass in when we actually perform + // the computation. + // + // Let's create a Simple IR Evaluator codegen for our computation: + SimpleIREvaluator ir_eval(loopnest.root_stmt(), {A, B, X}); + + // We are using the simplest codegen, and with it almost no work is done at the + // construction step. Real codegens such as CUDA and LLVM perform + // compilation during that stage so that when we're about to run the + // computation everything is ready.
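+  // As a rough illustration (and assuming an LLVM-enabled build, the
+  // llvm_codegen.h header, and that LLVMCodeGen takes the same
+  // statement-plus-buffer-args constructor arguments as SimpleIREvaluator
+  // above), a "real" codegen would be created the same way; for LLVM and
+  // CUDA the compilation would happen right here, at construction time:
+#ifdef TORCH_ENABLE_LLVM
+  LLVMCodeGen llvm_codegen(loopnest.root_stmt(), {A, B, X});
+#endif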
+ + // Let's now create some inputs and run our computation with them: + std::vector<int> data_A(64 * 32, 3); // This will be the input A + std::vector<int> data_B(64 * 32, 5); // This will be the input B + std::vector<int> data_X(64 * 32, 0); // This will be used for the result + + // Now let's invoke our codegen to perform the computation on our data. We + // need to provide as many arguments as the number of placeholders and tensors we + // passed at codegen construction time. The position in these lists + // defines how the real data arrays from the latter call (these arguments are + // referred to as 'CallArg's in our codebase) correspond to the symbols + // (placeholders and tensors) used in the tensor expressions we constructed + // (these are referred to as 'BufferArg's). + // Thus, we will provide three arguments: data_A, data_B, and data_X. data_A + // contains data for the placeholder A, data_B - for the placeholder B, and + // data_X would be used for the contents of tensor X. + ir_eval(data_A, data_B, data_X); + + // Let's print one of the elements from each array to verify that the + // computation did happen: + std::cout << "A[10] = " << data_A[10] << std::endl + << "B[10] = " << data_B[10] << std::endl + << "X[10] = A[10] + B[10] = " << data_X[10] << std::endl; + // Prints: + // A[10] = 3 + // B[10] = 5 + // X[10] = A[10] + B[10] = 8 + } + + // TODO: Show how TorchScript IR is translated to TE + return 0; +} diff --git a/test/cpp_api_parity/parity-tracker.md b/test/cpp_api_parity/parity-tracker.md index b7ec61a5a958..66931b6f9316 100644 --- a/test/cpp_api_parity/parity-tracker.md +++ b/test/cpp_api_parity/parity-tracker.md @@ -88,11 +88,11 @@ torch::nn::GRU|Yes|No torch::nn::RNNCell|Yes|No torch::nn::LSTMCell|Yes|No torch::nn::GRUCell|Yes|No -torch::nn::Transformer|No|No +torch::nn::Transformer|Yes|No torch::nn::TransformerEncoder|No|No torch::nn::TransformerDecoder|No|No -torch::nn::TransformerEncoderLayer|No|No -torch::nn::TransformerDecoderLayer|No|No +torch::nn::TransformerEncoderLayer|Yes|No +torch::nn::TransformerDecoderLayer|Yes|No torch::nn::Identity|Yes|No torch::nn::Linear|Yes|No torch::nn::Bilinear|Yes|No diff --git a/test/cpp_extensions/cpp_c10d_extension.cpp b/test/cpp_extensions/cpp_c10d_extension.cpp index 188484cf9248..b4901cdbcf4d 100644 --- a/test/cpp_extensions/cpp_c10d_extension.cpp +++ b/test/cpp_extensions/cpp_c10d_extension.cpp @@ -63,7 +63,7 @@ std::shared_ptr ProcessGroupTest::allgather_base( std::shared_ptr ProcessGroupTest::barrier( const BarrierOptions& opts) { - throw std::runtime_error("ProcessGroupTest does not support barrier"); + return std::make_shared(); } std::shared_ptr ProcessGroupTest::gather( diff --git a/test/custom_backend/backend.py b/test/custom_backend/backend.py index 17e399d320a7..8b48ed0a4108 100644 --- a/test/custom_backend/backend.py +++ b/test/custom_backend/backend.py @@ -33,7 +33,7 @@ def to_custom_backend(module): Returns: The module, lowered so that it can run on TestBackend.
""" - lowered_module = torch._C._jit_to_backend("custom_backend", module._c, {"forward": {"": ""}}) + lowered_module = torch._C._jit_to_backend("custom_backend", module, {"forward": {"": ""}}) return lowered_module diff --git a/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py b/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py index 2b3d43814c0f..37c8f14af853 100644 --- a/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py +++ b/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py @@ -14,6 +14,7 @@ MultiProcessTestCase, requires_nccl, skip_if_lt_x_gpu, + skip_if_rocm, ) from torch.testing._internal.common_utils import run_tests @@ -97,6 +98,7 @@ def _run_and_get_grads(self, model): @requires_nccl() @skip_if_lt_x_gpu(2) + @skip_if_rocm def test_ddp_comm_hook_allreduce_hook(self): """ This unit test verifies the ``allreduce`` hook registered case gives same result @@ -114,6 +116,7 @@ def test_ddp_comm_hook_allreduce_hook(self): @requires_nccl() @skip_if_lt_x_gpu(2) + @skip_if_rocm def test_ddp_comm_hook_fp16compress_hook(self): """ This unit test verifies the ``fp16 compress`` hook registered case @@ -131,6 +134,7 @@ def test_ddp_comm_hook_fp16compress_hook(self): @requires_nccl() @skip_if_lt_x_gpu(2) + @skip_if_rocm def test_ddp_comm_hook_quantize_per_tensor_hook(self): """ This unit test verifies the ``quantize per tensor`` hook registered case @@ -148,6 +152,7 @@ def test_ddp_comm_hook_quantize_per_tensor_hook(self): @requires_nccl() @skip_if_lt_x_gpu(2) + @skip_if_rocm def test_ddp_comm_hook_quantize_per_channel_hook(self): """ This unit test verifies the ``quantize per channel`` hook registered case diff --git a/test/distributed/test_c10d.py b/test/distributed/test_c10d.py index cfd0930284a5..9d0c19bef7b3 100644 --- a/test/distributed/test_c10d.py +++ b/test/distributed/test_c10d.py @@ -29,7 +29,8 @@ from torch.testing._internal.common_distributed import MultiProcessTestCase, \ requires_gloo, requires_nccl, requires_nccl_version, \ skip_if_not_multigpu, skip_if_lt_x_gpu, get_timeout, skip_if_rocm, \ - simple_sparse_reduce_tests + skip_if_rocm_single_process, simple_sparse_reduce_tests, skip_if_win32, \ + create_device from torch.testing._internal.common_utils import TestCase, load_tests, run_tests, \ retry_on_connect_failures, ADDRESS_IN_USE, CONNECT_TIMEOUT, TEST_WITH_TSAN @@ -255,6 +256,7 @@ def create_tcp_store(addr): raise RuntimeError("Unable to find free port (tried %s)" % ", ".join(ports)) +@skip_if_win32() class TCPStoreTest(TestCase, StoreTestBase): def _create_store(self): store = create_tcp_store('localhost') @@ -272,7 +274,32 @@ def test_address_already_in_use(self): store1 = c10d.TCPStore(addr, port, 1, True) # noqa: F841 store2 = c10d.TCPStore(addr, port, 1, True) # noqa: F841 + def _test_numkeys_delkeys(self, fs): + # We start off with one init key in the store to coordinate workers + self.assertEqual(fs.num_keys(), 1) + fs.add("key", 1) + fs.add("key", 2) + fs.add("key", 3) + fs.set("key0", "value0") + fs.add("key3", 1) + fs.set("key1", "value1") + self.assertEqual(fs.num_keys(), 5) + fs.delete_key("key") + self.assertEqual(fs.num_keys(), 4) + with self.assertRaises(RuntimeError): + fs.get("key") + fs.delete_key("key0") + fs.delete_key("key3") + self.assertEqual(fs.num_keys(), 2) + fs.set("key4", "value2") + self.assertEqual(fs.num_keys(), 3) + self.assertEqual(b"value1", fs.get("key1")) + self.assertEqual(b"value2", fs.get("key4")) + + def test_numkeys_delkeys(self): + self._test_numkeys_delkeys(self._create_store()) 
+@skip_if_win32() class PrefixTCPStoreTest(TestCase, StoreTestBase): def setUp(self): super(PrefixTCPStoreTest, self).setUp() @@ -329,13 +356,14 @@ def test_unknown_handler(self): c10d.rendezvous('invalid://') +@skip_if_win32() class RendezvousEnvTest(TestCase): @retry_on_connect_failures + @requires_nccl() def test_common_errors(self): - # TODO remove this hack - if not hasattr(c10d, "ProcessGroupNCCL"): - raise unittest.SkipTest("C10D is not built with NCCL process group," - " skipping test") + if torch.cuda.device_count() == 0: + raise unittest.SkipTest("No GPUs available, skipping test") + vars = { "WORLD_SIZE": "1", "RANK": "0", @@ -455,7 +483,7 @@ def test_common_errors(self): def test_nominal(self): with tempfile.NamedTemporaryFile(delete=False) as file: - url = 'file://%s?world_size=%d' % (file.name, 2) + url = f'file:///{file.name.replace(os.path.sep, "/")}?world_size=2' gen0 = c10d.rendezvous(url + "&rank=0") store0, rank0, size0 = next(gen0) self.assertEqual(0, rank0) @@ -474,6 +502,7 @@ def test_nominal(self): self.assertEqual(b"value1", store0.get("key1")) +@skip_if_win32() class RendezvousTCPTest(TestCase): def create_tcp_url(self): @@ -544,9 +573,13 @@ def _test_store_timeout(self, backend, init_method, c2p): def _init_methods(self): f = tempfile.NamedTemporaryFile(delete=False) - yield "file://%s" % f.name - f.close() - yield "tcp://127.0.0.1:%d" % common.find_free_port() + if sys.platform == 'win32': + yield "file:///%s" % f.name.replace("\\", "/") + f.close() + else: + yield "file://%s" % f.name + f.close() + yield "tcp://127.0.0.1:%d" % common.find_free_port() def _test_default_store_timeout(self, backend): for init_method in self._init_methods(): @@ -571,6 +604,8 @@ def _test_default_store_timeout(self, backend): @requires_nccl() @retry_on_connect_failures def test_default_store_timeout_nccl(self): + if torch.cuda.device_count() == 0: + raise unittest.SkipTest("No GPUs available, skipping test") self._test_default_store_timeout('nccl') @requires_gloo() @@ -584,11 +619,16 @@ def test_default_store_timeout_gloo(self): class ProcessGroupGlooTest(MultiProcessTestCase): def setUp(self): super(ProcessGroupGlooTest, self).setUp() - self._fork_processes() + + # For Windows platform, Python does not support fork, change it to spawn here. 
+ if sys.platform == 'win32': + self._spawn_processes() + else: + self._fork_processes() def opts(self, threads=2): opts = c10d.ProcessGroupGloo.Options() - opts.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] + opts.devices = [create_device(interface=LOOPBACK)] opts.timeout = 5.0 opts.threads = threads return opts @@ -598,8 +638,8 @@ def test_multi_device_constructor(self): opts = c10d.ProcessGroupGloo.Options() opts.timeout = 5.0 opts.devices = [ - c10d.ProcessGroupGloo.create_device(interface=LOOPBACK), - c10d.ProcessGroupGloo.create_device(interface=LOOPBACK), + create_device(interface=LOOPBACK), + create_device(interface=LOOPBACK), ] pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, opts) @@ -1514,6 +1554,7 @@ def test_barrier_implies_wait(self): for i, tensor in enumerate(tensors): self.assertEqual(torch.full(size, float(i * self.world_size)), tensor) + @skip_if_win32() def test_round_robin(self): num_process_groups = 2 store = c10d.FileStore(self.file_name, self.world_size) @@ -1531,6 +1572,7 @@ def test_round_robin(self): pg.broadcast(tensor, root=0).wait() self.assertEqual(torch.full([100, 100], 0.), tensor) + @skip_if_win32() def test_round_robin_create_destroy(self): store = c10d.FileStore(self.file_name, self.world_size) @@ -1553,12 +1595,30 @@ def create(num, prefix): self.assertEqual(torch.full([10, 10], float(self.world_size)), tensor) del pg +class ProcessGroupNCCLNoGPUTest(TestCase): + MAIN_PROCESS_RANK = 0 + + def setUp(self): + self.rank = self.MAIN_PROCESS_RANK + self.world_size = 1 + self.file = tempfile.NamedTemporaryFile(delete=False) + self.num_gpus = torch.cuda.device_count() + if self.num_gpus > 0: + raise unittest.SkipTest("GPUs are available, skipping test") + + def tearDown(self): + pass + + @requires_nccl() + @skip_if_rocm_single_process + def test_init_no_gpus(self): + store = c10d.FileStore(self.file.name, self.world_size) + with self.assertRaisesRegex( + RuntimeError, + "ProcessGroupNCCL is only supported with GPUs, no GPUs found!"): + c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + -@requires_nccl() -@unittest.skipIf( - TEST_WITH_TSAN, - "TSAN is not fork-safe since we're forking in a multi-threaded environment", -) class ProcessGroupNCCLTest(TestCase): MAIN_PROCESS_RANK = 0 @@ -1573,6 +1633,8 @@ def setUp(self): def tearDown(self): pass + @requires_nccl() + @skip_if_rocm_single_process def test_empty_tensors(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -1597,6 +1659,8 @@ def test_empty_tensors(self): pg.reduce_scatter(ys, xs).wait() self.assertEqual(0, ys[0].numel()) + @requires_nccl() + @skip_if_rocm_single_process def test_broadcast_ops(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -1619,6 +1683,8 @@ def broadcast(xs, rootRank, rootTensor): for i in range(self.num_gpus): self.assertEqual(tensors[i], tensors[rt]) + @requires_nccl() + @skip_if_rocm_single_process def test_allreduce_ops(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -1680,6 +1746,8 @@ def allreduce(tensors, op): with self.assertRaisesRegex(RuntimeError, "Cannot use " + str(op) + " with NCCL"): allreduce(tensors, op) + @requires_nccl() + @skip_if_rocm_single_process def test_reduce_ops(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, 
self.world_size) @@ -1710,6 +1778,8 @@ def reduce(xs, rootRank, rootTensor, op=None): with self.assertRaisesRegex(RuntimeError, "Cannot use " + str(op) + " with NCCL"): reduce(tensors, self.rank, rt, op) + @requires_nccl() + @skip_if_rocm_single_process def test_allgather_ops(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -1735,6 +1805,8 @@ def allgather(output_ts, input_ts): for s_idx, t in enumerate(device_ts): self.assertEqual(torch.tensor([s_idx]), t) + @requires_nccl() + @skip_if_rocm_single_process def test_reduce_scatter_ops(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -1812,6 +1884,8 @@ def reduce_scatter(outputs, input_lists, op): # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 self.assertEqualIgnoreType(expected, output[i]) + @requires_nccl() + @skip_if_rocm_single_process def test_barrier(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -1936,7 +2010,7 @@ def forward(self, x): return self.p + x -class TestDdpCommHook(nn.Module): +class ModuleForDdpCommHook(nn.Module): def __init__(self): super().__init__() self.t0 = Task() @@ -1958,7 +2032,10 @@ def forward(self, x): class DistributedDataParallelTest(MultiProcessTestCase): def setUp(self): super(DistributedDataParallelTest, self).setUp() - self._fork_processes() + if sys.platform == 'win32': + self._spawn_processes() + else: + self._fork_processes() def tearDown(self): # DistributedDataParallel test doesn't seem to call FileStore destructor @@ -1973,13 +2050,15 @@ def tearDown(self): def world_size(self): return 2 - def _prepare_single_device_module(self, process_group, devices, device_ids, global_batch_size): + def _prepare_single_device_module( + self, process_group, devices, device_ids, global_batch_size, gradient_as_bucket_view=False): model = Net() ddp_model = DistributedDataParallel( copy.deepcopy(model).to(devices[0]), device_ids=device_ids, process_group=process_group, - bucket_cap_mb=0.001) + bucket_cap_mb=0.001, + gradient_as_bucket_view=gradient_as_bucket_view) model.to(devices[0]) @@ -1988,7 +2067,7 @@ def _prepare_single_device_module(self, process_group, devices, device_ids, glob return model, ddp_model, input, target - def _prepare_multi_device_module(self, process_group, devices, device_ids, global_batch_size): + def _prepare_multi_device_module(self, process_group, devices, device_ids, global_batch_size, gradient_as_bucket_view=False): self.assertTrue( len(devices) == 2 or len(devices) == 4, "unexpected devices for ddp tests {}".format(devices)) @@ -2001,14 +2080,15 @@ def _prepare_multi_device_module(self, process_group, devices, device_ids, globa copy.deepcopy(model), device_ids=device_ids, process_group=process_group, - bucket_cap_mb=0.001) + bucket_cap_mb=0.001, + gradient_as_bucket_view=gradient_as_bucket_view) input = torch.randn(global_batch_size, 2).cuda(devices[0]) target = torch.randn(global_batch_size, 4) return model, ddp_model, input, target - def _test_ddp_with_process_group(self, process_group, devices, device_ids, multi_device=False): + def _test_ddp_with_process_group(self, process_group, devices, device_ids, multi_device=False, gradient_as_bucket_view=False): """ Note: we pass down `device_ids` all the way to DistributedDataParallel as part of the test. 
Below you find tests that either use a list of @@ -2022,11 +2102,11 @@ def _test_ddp_with_process_group(self, process_group, devices, device_ids, multi if multi_device: model, ddp_model, input, target = \ self._prepare_multi_device_module( - process_group, devices, device_ids, global_batch_size) + process_group, devices, device_ids, global_batch_size, gradient_as_bucket_view) else: model, ddp_model, input, target = \ self._prepare_single_device_module( - process_group, devices, device_ids, global_batch_size) + process_group, devices, device_ids, global_batch_size, gradient_as_bucket_view) def step_model(model, input, target): model.train() @@ -2061,17 +2141,21 @@ def update_parameters(model): torch.manual_seed(1337 + iteration) input = input[torch.randperm(global_batch_size)] - def _test_gloo_backend(self, devices, device_ids, multi_device=False): + def _test_gloo_backend(self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False): store = c10d.FileStore(self.file_name, self.world_size) options = c10d.ProcessGroupGloo.Options() - options.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] + options.devices = [create_device(interface=LOOPBACK)] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) - self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device) + self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device, gradient_as_bucket_view) @requires_gloo() def test_gloo_backend_cpu_module(self): self._test_gloo_backend([torch.device("cpu")], []) + @requires_gloo() + def test_gloo_backend_cpu_module_grad_is_view(self): + self._test_gloo_backend([torch.device("cpu")], [], gradient_as_bucket_view=True) + @requires_gloo() @skip_if_not_multigpu def test_gloo_backend_1gpu_module_device_ids_integer_list(self): @@ -2088,6 +2172,7 @@ def test_gloo_backend_1gpu_module_device_ids_torch_device_list(self): @requires_gloo() @skip_if_lt_x_gpu(4) + @skip_if_rocm def test_gloo_backend_2gpu_module(self): int_devices = gpus_for_rank(self.world_size)[self.rank][:2] devices = [torch.device("cuda:" + str(i)) for i in int_devices] @@ -2100,10 +2185,10 @@ def test_gloo_backend_4gpu_module(self): devices = [torch.device("cuda:" + str(i)) for i in int_devices] self._test_gloo_backend(devices, [], multi_device=True) - def _test_nccl_backend(self, devices, device_ids, multi_device=False): + def _test_nccl_backend(self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device) + self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device, gradient_as_bucket_view) @requires_nccl() @skip_if_not_multigpu @@ -2123,6 +2208,7 @@ def test_nccl_backend_1gpu_module_device_ids_torch_device_list(self): @requires_nccl() @skip_if_lt_x_gpu(4) + @skip_if_rocm def test_nccl_backend_2gpu_module(self): int_devices = gpus_for_rank(self.world_size)[self.rank][:2] devices = [torch.device("cuda:" + str(i)) for i in int_devices] @@ -2130,6 +2216,7 @@ def test_nccl_backend_2gpu_module(self): @requires_nccl() @skip_if_lt_x_gpu(8) + @skip_if_rocm def test_nccl_backend_4gpu_module(self): int_devices = gpus_for_rank(self.world_size)[self.rank][:4] devices = [torch.device("cuda:" + str(i)) for i in int_devices] @@ -2137,6 +2224,7 @@ def test_nccl_backend_4gpu_module(self): 
@requires_nccl() @skip_if_lt_x_gpu(4) + @skip_if_rocm def test_ddp_multi_device_module_config(self): gpus = gpus_for_rank(self.world_size)[self.rank] @@ -2165,9 +2253,7 @@ def test_ddp_multi_device_module_config(self): ddp_model = DistributedDataParallel( model, device_ids=gpus, process_group=process_group) - @requires_nccl() - @skip_if_not_multigpu - def test_fp16(self): + def _test_fp16(self, gradient_as_bucket_view=False): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -2179,6 +2265,7 @@ def test_fp16(self): device_ids=[gpus[0]], process_group=process_group, bucket_cap_mb=0.001, + gradient_as_bucket_view=gradient_as_bucket_view ) # Input 2**15, so that the gradients will overflow with a @@ -2198,7 +2285,17 @@ def test_fp16(self): @requires_nccl() @skip_if_not_multigpu - def test_arbitrary_forward_return_value(self): + @skip_if_rocm + def test_fp16(self): + self._test_fp16() + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_fp16_grad_is_view(self): + self._test_fp16(gradient_as_bucket_view=True) + + def _test_arbitrary_forward_return_value(self, gradient_as_bucket_view=False): """ Note: this test can be sped up by only running it on a CPU module once DistributedDataParallel supports them. @@ -2234,6 +2331,7 @@ def forward(self, x, fn): ForwardReturnValueModule().float().to(device_id), device_ids=[device_id], process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, ) batch_size = 4 @@ -2289,7 +2387,16 @@ def test(box, unbox): @requires_nccl() @skip_if_not_multigpu @skip_if_rocm - def test_find_unused_parameters_kwarg(self): + def test_arbitrary_forward_return_value(self): + self._test_arbitrary_forward_return_value() + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_arbitrary_forward_return_value_grad_is_view(self): + self._test_arbitrary_forward_return_value(gradient_as_bucket_view=True) + + def _test_find_unused_parameters_kwarg(self, gradient_as_bucket_view=False): """ Note: this test can be sped up by only running it on a CPU module once DistributedDataParallel supports them. @@ -2319,12 +2426,13 @@ def forward(self, x): input = torch.rand([batch_size, 2], dtype=torch.float) target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to(device_id) - def test_find_unused_parameters(find_unused_parameters, test_default=False): + def test_find_unused_parameters(find_unused_parameters, test_default=False, gradient_as_bucket_view=False): if test_default: model = DistributedDataParallel( FindUnusedParametersModule().float().to(device_id), device_ids=[device_id], process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, ) else: model = DistributedDataParallel( @@ -2332,6 +2440,7 @@ def test_find_unused_parameters(find_unused_parameters, test_default=False): device_ids=[device_id], process_group=process_group, find_unused_parameters=find_unused_parameters, + gradient_as_bucket_view=gradient_as_bucket_view, ) output, fc3 = model(input) @@ -2343,7 +2452,7 @@ def test_find_unused_parameters(find_unused_parameters, test_default=False): # trigger an error when `backward` is called (because fc3 is an unused # parameter and will therefore be marked ready twice). 
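# Illustrative sketch, not from this patch: the reducer error asserted just below comes
# from invoking a submodule that DDP has already classified as unused. With
# find_unused_parameters=True, the reducer walks the autograd graph from the tensors
# returned by forward() and immediately marks unreachable parameters (fc3 here) as
# ready; calling fc3 afterwards and backpropagating through it marks them ready a
# second time, hence "Expected to mark a variable ready only once.". The shape of
# that anti-pattern, reusing the surrounding test's objects:
#
#     output, fc3 = model(input)            # fc3 escapes forward(); DDP sees it as unused
#     output = fc3(output)                  # ...yet it still ends up in the loss graph
#     criterion(output, target).backward()  # fc3's params are marked ready twice -> error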
try: - test_find_unused_parameters(True) + test_find_unused_parameters(True, gradient_as_bucket_view=gradient_as_bucket_view) except Exception as ex: self.assertTrue( str(ex).startswith("Expected to mark a variable ready only once.")) @@ -2353,19 +2462,29 @@ def test_find_unused_parameters(find_unused_parameters, test_default=False): # Then test that the default behavior can be overridden by setting # `find_unused_parameters=False`. try: - test_find_unused_parameters(False) + test_find_unused_parameters(False, gradient_as_bucket_view=gradient_as_bucket_view) except Exception as ex: self.fail("Unexpected exception: %s" % ex) # Test find_unused_parameters defaults to False try: - test_find_unused_parameters(True, test_default=True) + test_find_unused_parameters(True, test_default=True, gradient_as_bucket_view=gradient_as_bucket_view) except Exception as ex: self.fail("Unexpected exception: %s" % ex) - @requires_gloo() - @skip_if_lt_x_gpu(2) - def test_global_local_unused_params_grad(self): + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_find_unused_parameters_kwarg(self): + self._test_find_unused_parameters_kwarg() + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_find_unused_parameters_kwarg_grad_is_view(self): + self._test_find_unused_parameters_kwarg(gradient_as_bucket_view=True) + + def _test_global_local_unused_params_grad(self, gradient_as_bucket_view=False): """ By simulating a multi-task training, this test is to make sure: 1) DDP does not touch the grad of globally unused parameters. @@ -2411,6 +2530,7 @@ def run_and_verify_grad(model): GlobalLocalUnusedParamModule().cpu(), process_group=process_group, find_unused_parameters=True, + gradient_as_bucket_view=gradient_as_bucket_view, ) run_and_verify_grad(cpu_model) @@ -2421,9 +2541,20 @@ def run_and_verify_grad(model): device_ids=[device_id], process_group=process_group, find_unused_parameters=True, + gradient_as_bucket_view=gradient_as_bucket_view, ) run_and_verify_grad(gpu_model) + @requires_gloo() + @skip_if_lt_x_gpu(2) + def test_global_local_unused_params_grad(self): + self._test_global_local_unused_params_grad() + + @requires_gloo() + @skip_if_lt_x_gpu(2) + def test_global_local_unused_params_grad_with_grad_is_view(self): + self._test_global_local_unused_params_grad(gradient_as_bucket_view=True) + @requires_gloo() @skip_if_lt_x_gpu(2) def test_find_unused_parameters_when_unused_parameters_empty(self): @@ -2480,9 +2611,7 @@ def run_and_verify_grad(model): ) run_and_verify_grad(gpu_model) - @requires_nccl() - @skip_if_not_multigpu - def test_multiple_outputs_multiple_backward(self): + def _test_multiple_outputs_multiple_backward(self, gradient_as_bucket_view=False): """ Note: this test can be sped up by only running it on a CPU module once DistributedDataParallel supports them. 
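# Illustrative sketch, not from this patch: the hunks in this file follow a single
# refactoring pattern -- each existing test body becomes a private _test_* helper
# that accepts gradient_as_bucket_view (the new DDP flag that makes .grad tensors
# views into the reducer's communication buckets), and two thin public tests run it
# with the flag off and on. Schematically (the test names here are placeholders):
class ExampleDDPTest(MultiProcessTestCase):
    def _test_some_behavior(self, gradient_as_bucket_view=False):
        ...  # original test body, forwarding the flag to DistributedDataParallel

    @requires_nccl()
    @skip_if_not_multigpu
    @skip_if_rocm
    def test_some_behavior(self):
        self._test_some_behavior()

    @requires_nccl()
    @skip_if_not_multigpu
    @skip_if_rocm
    def test_some_behavior_grad_is_view(self):
        self._test_some_behavior(gradient_as_bucket_view=True)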
@@ -2516,6 +2645,7 @@ def forward(self, x): MultipleOutputModule().float().to(device_id), device_ids=[device_id], process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, ) batch_size = 4 @@ -2532,6 +2662,19 @@ def forward(self, x): @requires_nccl() @skip_if_not_multigpu + @skip_if_rocm + def test_multiple_outputs_multiple_backward(self): + self._test_multiple_outputs_multiple_backward() + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_multiple_outputs_multiple_backward_grad_is_view(self): + self._test_multiple_outputs_multiple_backward(gradient_as_bucket_view=True) + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm def test_no_grad(self): """ Note: this test can be sped up by only running it on a CPU module @@ -2578,7 +2721,7 @@ def check_no_grads(): # No parameter should have their gradient set. check_no_grads() - def _test_accumulate_gradients_no_sync(self, num_iters=2, ddp_comm_hook=None): + def _test_accumulate_gradients_no_sync(self, num_iters=2, ddp_comm_hook=None, gradient_as_bucket_view=False): """ This is the recommended way to implement accumulate grads. If ``ddp_comm_hook`` input was specified, it will also register that hook @@ -2593,7 +2736,7 @@ def _test_accumulate_gradients_no_sync(self, num_iters=2, ddp_comm_hook=None): local_batch_size = len(devices) model, ddp_model, input, target = self._prepare_single_device_module( - process_group, devices, devices, global_batch_size + process_group, devices, devices, global_batch_size, gradient_as_bucket_view ) if ddp_comm_hook is not None: @@ -2643,6 +2786,7 @@ def step_model(model, input, target): @requires_nccl() @skip_if_not_multigpu + @skip_if_rocm def test_accumulate_gradients_no_sync(self): """ Runs _test_accumulate_gradients_no_sync using default inputs @@ -2651,6 +2795,16 @@ def test_accumulate_gradients_no_sync(self): @requires_nccl() @skip_if_not_multigpu + @skip_if_rocm + def test_accumulate_gradients_no_sync_grad_is_view(self): + """ + Runs _test_accumulate_gradients_no_sync using default inputs + """ + self._test_accumulate_gradients_no_sync(gradient_as_bucket_view=True) + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm def test_accumulate_gradients_no_sync_allreduce_hook(self): """ Runs multiple iterations on _test_accumulate_gradients_no_sync @@ -2670,6 +2824,7 @@ def allreduce_hook( @requires_nccl() @skip_if_not_multigpu + @skip_if_rocm def test_accumulate_gradients_no_sync_allreduce_with_then_hook(self): """ Runs multiple iterations on _test_accumulate_gradients_no_sync using allreduce @@ -2697,9 +2852,7 @@ def div(fut): num_iters=4, ddp_comm_hook=allreduce_with_then_hook ) - @requires_nccl() - @skip_if_not_multigpu - def test_accumulate_gradients_module(self): + def _test_accumulate_gradients_module(self, gradient_as_bucket_view=False): # This is NOT the recommended way to implement accumulating grads, but # we would like to make sure DDP does not mess up with the underlying # module. 
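# Illustrative sketch, not from this patch: the "recommended way to implement
# accumulate grads" mentioned in the docstring above is DistributedDataParallel's
# no_sync() context manager, which suppresses the gradient all-reduce for the
# iterations run inside it. A generic form of the pattern; ddp_model, batches,
# loss_fn, optimizer and accumulation_steps are assumed to exist:
for step, (inp, target) in enumerate(batches):
    if (step + 1) % accumulation_steps != 0:
        with ddp_model.no_sync():                   # accumulate locally, no communication
            loss_fn(ddp_model(inp), target).backward()
    else:
        loss_fn(ddp_model(inp), target).backward()  # gradients are all-reduced here
        optimizer.step()
        optimizer.zero_grad()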
@@ -2711,7 +2864,7 @@ def test_accumulate_gradients_module(self): model, ddp_model, input, target = \ self._prepare_single_device_module( - process_group, devices, devices, global_batch_size) + process_group, devices, devices, global_batch_size, gradient_as_bucket_view) def step_model(model, input, target): model.train() @@ -2751,6 +2904,18 @@ def step_model(model, input, target): torch.manual_seed(1337 + iteration) input = input[torch.randperm(global_batch_size)] + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_accumulate_gradients_module(self): + self._test_accumulate_gradients_module() + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_accumulate_gradients_module_with_grad_is_view(self): + self._test_accumulate_gradients_module(gradient_as_bucket_view=True) + @requires_gloo() def test_ignored_output(self): """ @@ -2840,6 +3005,7 @@ def forward(self, x): @requires_nccl() @skip_if_not_multigpu + @skip_if_rocm def test_failure_recovery(self): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -2903,6 +3069,92 @@ def forward(self, x): loss = criterion(output, target) loss.backward() + @requires_nccl() + @skip_if_not_multigpu + def test_save_load_checkpoint(self): + dist.init_process_group( + "gloo", + init_method=f"file://{self.file_name}", + world_size=self.world_size, + rank=self.rank + ) + + class TestModel(nn.Module): + def __init__(self): + super(TestModel, self).__init__() + self.fc1 = nn.Linear(2, 10, bias=False) + self.fc2 = nn.Linear(10, 4, bias=False) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + return F.softmax(x, dim=1) + + def train_loop(model, optimizer, iterations): + for _ in range(iterations): + optimizer.zero_grad() + output = model(input) + loss = criterion(output, target) + loss.backward() + optimizer.step() + + device_id = gpus_for_rank(self.world_size)[self.rank][0] + + model_withload = TestModel().float().to(device_id) + model_withoutload = TestModel().float().to(device_id) + + ddp_withload = DistributedDataParallel( + model_withload, + device_ids=[device_id], + ) + ddp_withoutload = DistributedDataParallel( + model_withoutload, + device_ids=[device_id], + ) + + # ensure that both models start with the same set of parameters. 
By default they are randomized on construction + for p in ddp_withload.parameters(): + with torch.no_grad(): + p.zero_() + for p in ddp_withoutload.parameters(): + with torch.no_grad(): + p.zero_() + + batch_size = 4 + criterion = nn.CrossEntropyLoss() + + optimizer_withload = torch.optim.SGD(ddp_withload.parameters(), lr=0.001) + optimizer_withoutload = torch.optim.SGD(ddp_withoutload.parameters(), lr=0.001) + + input = torch.rand([batch_size, 2], dtype=torch.float) + target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to(device_id) + + # run the model for 6 iterations, with a checkpoint in the middle + train_loop(ddp_withload, optimizer_withload, 3) + + # zero out parameters and reload them from the state dict + checkpoint_path = tempfile.gettempdir() + "/model.checkpoint" + if self.rank == 0: + torch.save(ddp_withload.state_dict(), checkpoint_path) + + dist.barrier() + for p in ddp_withload.parameters(): + with torch.no_grad(): + p.zero_() + map_location = {'cuda:%d' % 0: 'cuda:%d' % self.rank} + ddp_withload.load_state_dict( + torch.load(checkpoint_path, map_location=map_location)) + + train_loop(ddp_withload, optimizer_withload, 3) + + # re-run the model with the same inputs for 6 iterations with no checkpoint + train_loop(ddp_withoutload, optimizer_withoutload, 6) + + for p_withload, p_withoutload in zip(ddp_withload.parameters(), ddp_withoutload.parameters()): + self.assertEqual(p_withload, p_withoutload) + + def _run_and_verify_sparse_gradients(self, vanilla_model, ddp_model): mult = 2 batch_size = mult * self.world_size @@ -2923,8 +3175,7 @@ def _run_and_verify_sparse_gradients(self, vanilla_model, ddp_model): ddp_parameter = next(ddp_model.parameters()) self.assertEqual(vanilla_parameter.grad, ddp_parameter.grad) - @requires_gloo() - def test_sparse_gradients(self): + def _test_sparse_gradients(self, gradient_as_bucket_view=False): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) @@ -2935,10 +3186,19 @@ def test_sparse_gradients(self): ddp_model = DistributedDataParallel( copy.deepcopy(vanilla_model), process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, ) self._run_and_verify_sparse_gradients(vanilla_model, ddp_model) + @requires_gloo() + def test_sparse_gradients(self): + self._test_sparse_gradients() + + @requires_gloo() + def test_sparse_gradients_grad_is_view(self): + self._test_sparse_gradients(gradient_as_bucket_view=True) + def _test_grad_layout(self, replica_devices, layer_devs, local_batch_size): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -3097,7 +3357,7 @@ def test_ddp_comm_hook_future_passing_cpu(self): # Test on CPU cpu_model = DistributedDataParallel( - TestDdpCommHook().cpu(), process_group=process_group + ModuleForDdpCommHook().cpu(), process_group=process_group ) # Register DDP Communication Hook @@ -3107,12 +3367,13 @@ def test_ddp_comm_hook_future_passing_cpu(self): # without the comm_hook, result would be 0.25 * torch.ones(2, 2). 
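# Illustrative sketch, not from this patch: the save/load checkpoint test above uses
# the standard rank-0-saves / every-rank-loads pattern. Reduced to its essentials,
# with ddp_model, rank and checkpoint_path standing in for the test's own objects:
import torch
import torch.distributed as dist

if rank == 0:
    torch.save(ddp_model.state_dict(), checkpoint_path)
dist.barrier()                               # wait until the file exists on every rank
map_location = {'cuda:0': 'cuda:%d' % rank}  # remap rank 0's GPU tensors to this rank
ddp_model.load_state_dict(torch.load(checkpoint_path, map_location=map_location))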
self._run_and_verify_hook(cpu_model, 8, 2 * torch.ones(2, 2)) - def _gpu_model_with_ddp_comm_hook(self, process_group, hook=None): + def _gpu_model_with_ddp_comm_hook(self, process_group, hook=None, gradient_as_bucket_view=False): device_id = gpus_for_rank(self.world_size)[self.rank][0] gpu_model = DistributedDataParallel( - TestDdpCommHook().to(device_id), + ModuleForDdpCommHook().to(device_id), device_ids=[device_id], process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, ) # Register DDP Communication Hook if defined @@ -3161,6 +3422,7 @@ def test_ddp_comm_hook_future_passing_gpu_gloo(self): @requires_nccl() @skip_if_lt_x_gpu(2) + @skip_if_rocm def test_ddp_comm_hook_future_passing_gpu_nccl(self): """ This unit test verifies whether the Future object is passed properly using nccl backend. @@ -3176,9 +3438,7 @@ def test_ddp_comm_hook_future_passing_gpu_nccl(self): # without the comm_hook, result would be 0.25 * torch.ones(2, 2). self._run_and_verify_hook(gpu_model, 8, 2 * torch.ones(2, 2)) - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_ddp_comm_hook_allreduce_hook_nccl(self): + def _test_ddp_comm_hook_allreduce_hook_nccl(self, gradient_as_bucket_view=False): """ This unit test verifies whether a DDP communication hook that just calls allreduce gives the same result result with the case of no hook registered. @@ -3193,13 +3453,26 @@ def allreduce_hook(state: object, bucket: dist._GradBucket) -> torch._C.Future: return process_group.allreduce(tensors).get_future() # Get GPU model with allreduce_hook registered. - gpu_model = self._gpu_model_with_ddp_comm_hook(process_group, allreduce_hook) + gpu_model = self._gpu_model_with_ddp_comm_hook(process_group, allreduce_hook, gradient_as_bucket_view) # check whether the grads are equal to what DDP without hook would return. 
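# Illustrative sketch, not from this patch: a DDP communication hook has the
# signature hook(state, bucket) -> Future and must return a future that resolves to
# the reduced gradients for that bucket. A minimal allreduce hook in the style of
# these tests; process_group, world_size and gpu_model stand in for the test's
# objects, and bucket.get_tensors() is assumed to be the bucket accessor used here.
import torch
import torch.distributed as dist

def allreduce_hook(state: object, bucket: dist._GradBucket) -> torch._C.Future:
    # Divide first so the all-reduced sum matches DDP's default averaging behaviour.
    tensors = [t / world_size for t in bucket.get_tensors()]
    return process_group.allreduce(tensors).get_future()

# Registered once, before training, on the DDP-wrapped model:
gpu_model._register_comm_hook(state=None, hook=allreduce_hook)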
self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) @requires_nccl() @skip_if_lt_x_gpu(2) + @skip_if_rocm + def test_ddp_comm_hook_allreduce_hook_nccl(self): + self._test_ddp_comm_hook_allreduce_hook_nccl() + + @requires_nccl() + @skip_if_lt_x_gpu(2) + @skip_if_rocm + def test_ddp_comm_hook_allreduce_hook_nccl_grad_is_view(self): + self._test_ddp_comm_hook_allreduce_hook_nccl(gradient_as_bucket_view=True) + + @requires_nccl() + @skip_if_lt_x_gpu(2) + @skip_if_rocm def test_ddp_comm_hook_allreduce_with_then_hook_nccl(self): """ This unit test verifies whether a DDP communication hook that calls allreduce and then @@ -3243,7 +3516,7 @@ def test_ddp_invalid_comm_hook_init(self): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) - model = DistributedDataParallel(TestDdpCommHook(), process_group=process_group) + model = DistributedDataParallel(ModuleForDdpCommHook(), process_group=process_group) with self.assertRaisesRegex(TypeError, "Communication hook must be callable."): model._register_comm_hook(state=None, hook=1) @@ -3267,7 +3540,7 @@ def test_ddp_invalid_comm_hook_return_type(self): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) - model = DistributedDataParallel(TestDdpCommHook(), process_group=process_group) + model = DistributedDataParallel(ModuleForDdpCommHook(), process_group=process_group) with self.assertRaisesRegex( ValueError, @@ -3304,7 +3577,7 @@ def test_ddp_comm_hook_register_just_once(self): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) - model = DistributedDataParallel(TestDdpCommHook(), process_group=process_group) + model = DistributedDataParallel(ModuleForDdpCommHook(), process_group=process_group) def dummy_hook(state, bucket): fut = torch.futures.Future() @@ -3591,6 +3864,7 @@ def _run_all_reduce(self, pg): @requires_nccl() @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_nccl_errors_nonblocking(self): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -3642,36 +3916,42 @@ def _test_nccl_errors_blocking(self, func): @requires_nccl() @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_nccl_errors_blocking_clean_exit(self): self._test_nccl_errors_blocking(lambda: sys.exit(0)) @requires_nccl() @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_nccl_errors_blocking_nonzero_exit(self): self._test_nccl_errors_blocking(lambda: sys.exit(1)) @requires_nccl() @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_nccl_errors_blocking_abort(self): self._test_nccl_errors_blocking(lambda: os.abort()) @requires_nccl() @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_nccl_errors_blocking_sigkill(self): self._test_nccl_errors_blocking(lambda: os.kill(os.getpid(), signal.SIGKILL)) @requires_nccl() @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_nccl_errors_blocking_sigterm(self): self._test_nccl_errors_blocking(lambda: os.kill(os.getpid(), signal.SIGTERM)) @requires_nccl() 
@requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_nccl_blocking_wait_with_barrier(self): os.environ["NCCL_BLOCKING_WAIT"] = "1" store = c10d.FileStore(self.file_name, self.world_size) @@ -3694,6 +3974,7 @@ def _run_invalid_nccl_blocking_wait_env(self, val): @requires_nccl() @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_invalid_nccl_blocking_wait_env(self): self._run_invalid_nccl_blocking_wait_env('abc') self._run_invalid_nccl_blocking_wait_env('-1') @@ -3743,7 +4024,10 @@ def test_nccl_timeout(self): class CommTest(MultiProcessTestCase): def setUp(self): super(CommTest, self).setUp() - self._fork_processes() + if sys.platform == 'win32': + self._spawn_processes() + else: + self._fork_processes() def tearDown(self): super(CommTest, self).tearDown() @@ -3809,7 +4093,7 @@ def test_broadcast_coalesced_nccl(self): def test_broadcast_coalesced_gloo_cuda(self): store = c10d.FileStore(self.file_name, self.world_size) options = c10d.ProcessGroupGloo.Options() - options.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] + options.devices = [create_device(interface=LOOPBACK)] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) device = torch.device("cuda:%d" % self.rank) ranks = list(range(self.world_size)) @@ -3820,7 +4104,7 @@ def test_broadcast_coalesced_gloo_cuda(self): def test_broadcast_coalesced_gloo_cpu(self): store = c10d.FileStore(self.file_name, self.world_size) options = c10d.ProcessGroupGloo.Options() - options.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] + options.devices = [create_device(interface=LOOPBACK)] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) device = torch.device("cpu") ranks = list(range(self.world_size)) diff --git a/test/distributed/test_c10d_spawn.py b/test/distributed/test_c10d_spawn.py index d0bf00b8a08a..c84608e8f178 100644 --- a/test/distributed/test_c10d_spawn.py +++ b/test/distributed/test_c10d_spawn.py @@ -10,8 +10,10 @@ import torch.nn as nn from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU -from torch.testing._internal.common_distributed import requires_gloo -from torch.testing._internal.common_utils import TestCase, load_tests, run_tests, skipIfRocm +from torch.testing._internal.common_distributed import requires_gloo, \ + create_device +from torch.testing._internal.common_utils import TestCase, load_tests, \ + run_tests, skipIfRocm from torch.testing._internal.common_utils import NO_MULTIPROCESSING_SPAWN, TEST_WITH_TSAN @@ -39,7 +41,7 @@ class ProcessGroupShareTensorTest(TestCase): @classmethod def opts(cls, threads=2): opts = c10d.ProcessGroupGloo.Options() - opts.devices = [c10d.ProcessGroupGloo.create_device(interface="lo")] + opts.devices = [create_device(interface='lo')] opts.timeout = 5.0 opts.threads = threads return opts diff --git a/test/distributed/test_data_parallel.py b/test/distributed/test_data_parallel.py index dee5fd702b16..99a10906462a 100644 --- a/test/distributed/test_data_parallel.py +++ b/test/distributed/test_data_parallel.py @@ -775,6 +775,36 @@ def forward(self, x): print("Caught exception during iterations at " + named_msg, flush=True) raise + @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + def test_parameter_list_dict_replica(self): + class MyMod(torch.nn.Module): + def __init__(self, data): + super(MyMod, self).__init__() + self.data = data + + def forward(self, inp): + return inp + + p1 = torch.nn.Parameter(torch.rand(10)) + p2 
= torch.nn.Parameter(torch.rand(10)) + module = MyMod(torch.nn.ParameterList([p1, p2])).cuda() + model = dp.DataParallel(module) + input = torch.randn((8, 8), device="cuda") + + with self.assertWarnsRegex( + UserWarning, + r"nn\.ParameterList is being used with DataParallel but this"): + model(input) + + module = MyMod(torch.nn.ParameterDict({"0": p1, "1": p2})).cuda() + model = dp.DataParallel(module) + input = torch.randn((8, 8), device="cuda") + + with self.assertWarnsRegex( + UserWarning, + r"nn\.ParameterDict is being used with DataParallel but this"): + model(input) + if __name__ == '__main__': run_tests() diff --git a/test/fx/quantization.py b/test/fx/quantization.py index 968c797c9163..a2de582937aa 100644 --- a/test/fx/quantization.py +++ b/test/fx/quantization.py @@ -219,6 +219,7 @@ def observe(self, args): def load_arg(a): return map_arg(a, lambda node: env[node.name]) + output_node : Optional[Node] = None for node in self.graph.nodes: if node.op == 'placeholder': result = next(args_iter) @@ -232,6 +233,8 @@ def load_arg(a): result = getattr(self_obj, node.target)(*args, **kwargs) elif node.op == 'call_module': result = self.modules[node.target](*load_arg(node.args), **load_arg(node.kwargs)) + elif node.op == 'output': + return load_arg(node.args[0]) env[node.name] = result root_node, obj = self.matches.get(node.name, (None, None)) @@ -240,7 +243,7 @@ def load_arg(a): if node.name in self.quants: self.quants[node.name].observe(node, env) - return load_arg(self.graph.result) + raise RuntimeError('Graph had no output node!') def quantize(self): self.quantized_graph = Graph() @@ -281,7 +284,6 @@ def load_or_emit(n): else: quant_env[node.name] = r - self.quantized_graph.output(load_arg(self.graph.result, quantized=False)) return GraphModule(self.root, self.quantized_graph) def _find_matches(self, patterns): diff --git a/test/jit/test_backends.py b/test/jit/test_backends.py index e2eaa0b2a1e5..89330ddbd2d9 100644 --- a/test/jit/test_backends.py +++ b/test/jit/test_backends.py @@ -6,8 +6,14 @@ import torch import torch._C from pathlib import Path -from torch.testing._internal.common_utils import TEST_WITH_ROCM, skipIfRocm, IS_SANDCASTLE, IS_WINDOWS, IS_MACOS - +from torch.testing._internal.common_utils import ( + IS_FBCODE, + IS_MACOS, + IS_SANDCASTLE, + IS_WINDOWS, + TEST_WITH_ROCM, + skipIfRocm, +) # Make the helper files in test/ importable pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) sys.path.append(pytorch_test_dir) @@ -54,7 +60,7 @@ class JitBackendTestCase(JitTestCase): def setUp(self): super().setUp() - if TEST_WITH_ROCM or IS_SANDCASTLE or IS_WINDOWS or IS_MACOS: + if TEST_WITH_ROCM or IS_SANDCASTLE or IS_WINDOWS or IS_MACOS or IS_FBCODE: raise unittest.SkipTest("non-portable load_library call used in test") torch_root = Path(__file__).resolve().parent.parent.parent p = torch_root / 'build' / 'lib' / 'libjitbackend_test.so' @@ -101,7 +107,7 @@ def setUp(self): self.module = BasicModule() self.scripted_module = torch.jit.script(BasicModule()) self.lowered_module = to_test_backend_multi( - self.scripted_module._c, + self.scripted_module, {"accum": {"": ""}, "sub_accum": {"": ""}, "forward": {"": ""}}, ) @@ -161,7 +167,7 @@ def setUp(self): # Both modules in self.scripted_module are ScriptModules. 
self.scripted_module = torch.jit.script(NestedModuleTest.NestedModule(BasicModule())) lowered_module = to_test_backend_multi( - self.scripted_module._c, {"forward": {"": ""}} + self.scripted_module, {"forward": {"": ""}} ) # self.lowered_module is a ScriptModule, but its submodule is a lowered module. self.lowered_module = torch.jit.script(NestedModuleTest.NestedModule(lowered_module)) diff --git a/test/jit/test_class_type.py b/test/jit/test_class_type.py index c71be6ac1d9f..7c9e323163e6 100644 --- a/test/jit/test_class_type.py +++ b/test/jit/test_class_type.py @@ -13,7 +13,7 @@ from torch.testing._internal.jit_utils import JitTestCase import torch.testing._internal.jit_utils from torch.testing._internal.common_utils import IS_SANDCASTLE -from typing import List, Tuple, Iterable +from typing import List, Tuple, Iterable, Optional, Dict if __name__ == '__main__': raise RuntimeError("This test file is not meant to be run directly, use:\n\n" @@ -1020,6 +1020,106 @@ def foo(): y.my_list = new_list return y + def test_default_args(self): + """ + Test that methods on class types can have default arguments. + """ + @torch.jit.script + class ClassWithDefaultArgs: + def __init__( + self, + a: int = 1, + b: Optional[List[int]] = None, + c: Tuple[int, int, int] = (1, 2, 3), + d: Optional[Dict[int, int]] = None, + e: Optional[str] = None, + ): + self.int = a + self.tup = c + self.str = e + + self.list = [1, 2, 3] + if b is not None: + self.list = b + + self.dict = {1: 2, 3: 4} + if d is not None: + self.dict = d + + def add(self, b: int, scale: float = 1.0) -> float: + return self.int * scale + b + + def all_defaults() -> int: + obj: ClassWithDefaultArgs = ClassWithDefaultArgs() + return obj.int + obj.list[2] + obj.tup[1] + + def some_defaults() -> int: + obj: ClassWithDefaultArgs = ClassWithDefaultArgs(b=[5, 6, 7]) + return obj.int + obj.list[2] + obj.dict[1] + + def override_defaults() -> int: + obj: ClassWithDefaultArgs = ClassWithDefaultArgs(3, [9, 10, 11], (12, 13, 14), {3: 4}, "str") + s: int = obj.int + + for x in obj.list: + s += x + + for y in obj.tup: + s += y + + s += obj.dict[3] + + st = obj.str + if st is not None: + s += len(st) + + return s + + def method_defaults() -> float: + obj: ClassWithDefaultArgs = ClassWithDefaultArgs() + return obj.add(3) + obj.add(3, 0.25) + + self.checkScript(all_defaults, ()) + self.checkScript(some_defaults, ()) + self.checkScript(override_defaults, ()) + self.checkScript(method_defaults, ()) + + # The constructor of this class below has some arguments without default values. + class ClassWithSomeDefaultArgs: # noqa: B903 + def __init__( + self, + a: int, + b: int = 1, + ): + self.a = a + self.b = b + + def default_b() -> int: + obj: ClassWithSomeDefaultArgs = ClassWithSomeDefaultArgs(1) + return obj.a + obj.b + + def set_b() -> int: + obj: ClassWithSomeDefaultArgs = ClassWithSomeDefaultArgs(1, 4) + return obj.a + obj.b + + self.checkScript(default_b, ()) + self.checkScript(set_b, ()) + + # The constructor of this class below has mutable arguments. This should throw + # an error. + class ClassWithMutableArgs: # noqa: B903 + def __init__( + self, + a: List[int] = [1, 2, 3], # noqa: B006 + ): + self.a = a + + def should_fail(): + obj: ClassWithMutableArgs = ClassWithMutableArgs() + + with self.assertRaisesRegex(RuntimeError, "Mutable default parameters are not supported"): + torch.jit.script(should_fail) + def test_staticmethod(self): """ Test static methods on class types. 
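# Illustrative sketch, not from this patch: the test_default_args additions above
# exercise new support for default arguments on TorchScript class constructors and
# methods. A condensed, self-contained example of what now compiles (the class name
# and values here are illustrative, mirroring the tested patterns):
import torch
from typing import List, Optional

@torch.jit.script
class Accumulator(object):
    def __init__(self, start: int = 0, history: Optional[List[int]] = None):
        self.total = start
        self.history = [1, 2, 3]
        if history is not None:
            self.history = history

    def add(self, value: int, scale: float = 1.0) -> float:
        return self.total * scale + value

def use_defaults() -> float:
    acc = Accumulator()                  # every argument takes its default
    return acc.add(3) + acc.add(3, 0.25)

print(torch.jit.script(use_defaults)())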
@@ -1067,6 +1167,8 @@ def free_function(x: int) -> int: @torch.jit.script class Properties(object): + __jit_unused_properties__ = ["unsupported"] + def __init__(self, a: int): self.a = a @@ -1074,6 +1176,19 @@ def __init__(self, a: int): def attr(self) -> int: return self.a - 1 + @property + def unsupported(self) -> int: + return sum([self.a]) + + @torch.jit.unused + @property + def unsupported_2(self) -> int: + return sum([self.a]) + + @unsupported_2.setter + def unsupported_2(self, value): + self.a = sum([self.a]) + @attr.setter def attr(self, value: int): self.a = value + 3 diff --git a/test/jit/test_enum.py b/test/jit/test_enum.py index a242217a94c1..aa34c22413ad 100644 --- a/test/jit/test_enum.py +++ b/test/jit/test_enum.py @@ -267,6 +267,26 @@ def forward(self): self.assertEqual(scripted(), Color.RED.value) + def test_string_enum_as_module_attribute(self): + global Color + + class Color(Enum): + RED = "red" + GREEN = "green" + + class TestModule(torch.nn.Module): + def __init__(self, e: Color): + super(TestModule, self).__init__() + self.e = e + + def forward(self): + return (self.e.name, self.e.value) + + m = TestModule(Color.RED) + scripted = torch.jit.script(m) + + self.assertEqual(scripted(), (Color.RED.name, Color.RED.value)) + def test_enum_return(self): global Color diff --git a/test/jit/test_freezing.py b/test/jit/test_freezing.py index 2d2c404051f6..696b97059d19 100644 --- a/test/jit/test_freezing.py +++ b/test/jit/test_freezing.py @@ -237,8 +237,8 @@ def forward(self, x): def test_freeze_module_with_fork2(self): @torch.jit.script - def foo(x, y): - return x * y + def foo(x): + return x * 2 class TestModule(nn.Module): def __init__(self): @@ -247,8 +247,8 @@ def __init__(self): self.b = torch.ones(20, 20) def forward(self, x): - fut = torch.jit._fork(foo, self.a, self.b) - y_hat = foo(self.a, self.b) + fut = torch.jit._fork(foo, self.a) + y_hat = foo(self.b) y = torch.jit._wait(fut) return y_hat + y @@ -272,6 +272,50 @@ def forward(self, x): # conservatively assumes there is a mutation because attributes are # passed to fork subgraph. both 'a' and 'b' are preserved. self.assertTrue(mf.hasattr('a')) + self.assertFalse(mf.hasattr('b')) + output_f = mf.forward(input) + self.assertEqual(output_s, output_f) + + def test_freeze_module_with_fork_calling_module_method(self): + @torch.jit.script + def foo(x, y): + return x * y + + class TestModule(nn.Module): + def __init__(self): + super(TestModule, self).__init__() + self.a = torch.ones(20, 20) + self.b = torch.ones(20, 20) + + @torch.jit.export + def foo(self, x): + return x * self.a + + @torch.jit.export + def bar(self, x): + return x * self.b + + def forward(self, x): + fut = torch.jit._fork(self.foo, self.b) + y_hat = self.bar(self.a) + y = torch.jit._wait(fut) + return y_hat + y + + m = torch.jit.script(TestModule()) + m.eval() + input = torch.randn(2, 2) + output_s = m.forward(input) + mf = torch._C._freeze_module(m._c) + # Check if frozen module looks as below: + # module m { + # attributes { + # self.b = .. + # } + # ... + # TODO: Although there are no mutation, the alias analysis + # conservatively assumes there is a mutation because attributes are + # passed to fork subgraph. 'b' is preserved. 
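# Illustrative sketch, not from this patch: the freezing tests above combine
# torch._C._freeze_module with TorchScript's fork/wait API. The fork/wait pattern
# itself, outside of freezing, looks like this (these are the internal underscore
# APIs used by the tests):
import torch

@torch.jit.script
def double(x):
    return x * 2

@torch.jit.script
def forked_add(x):
    fut = torch.jit._fork(double, x)    # runs double(x) asynchronously
    y = double(x)                       # overlapping work on the calling thread
    return y + torch.jit._wait(fut)     # join the future

print(forked_add(torch.ones(2, 2)))     # tensor of 4s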
+ self.assertFalse(mf.hasattr('a')) self.assertTrue(mf.hasattr('b')) output_f = mf.forward(input) self.assertEqual(output_s, output_f) @@ -480,6 +524,77 @@ def forward(self, x): self.assertEqual(output_s, output_f) + def test_freeze_module_with_preserve_sub_module(self): + class SubModule(nn.Module): + def __init__(self): + super(SubModule, self).__init__() + self.a = torch.tensor([1.1]) + self.b = 2.2 + + def forward(self, x): + return self.a + + class TestModule(nn.Module): + def __init__(self): + super(TestModule, self).__init__() + self.sub1 = SubModule() # aliasing + self.sub2 = SubModule() + + def forward(self, x): + return self.sub2(x) + self.sub1(x) + m = TestModule() + ms = torch.jit.script(m) + ms.eval() + mf = torch._C._freeze_module(ms._c, ["sub1"]) + + # Test that 'sub1' is preserved entirely and 'sub2' is completely folded + self.assertTrue(mf.hasattr('sub1')) + self.assertTrue(mf.sub1.hasattr('a')) + self.assertTrue(mf.sub1.hasattr('b')) + self.assertFalse(mf.hasattr('sub2')) + input = torch.randn(2, 2) + output_s = ms.forward(input) + output_f = mf.forward(input) + self.assertEqual(output_s, output_f) + + def test_freeze_module_with_preserve_sub_module_and_mutation(self): + class SubModule(nn.Module): + def __init__(self): + super(SubModule, self).__init__() + self.a = torch.tensor([1.1]) + self.b = 2.2 + + def forward(self, x): + self.a[0] = 3.3 + return self.a + + class TestModule(nn.Module): + def __init__(self): + super(TestModule, self).__init__() + self.sub1 = SubModule() # aliasing + self.sub2 = SubModule() + + def forward(self, x): + return self.sub2(x) + self.sub1(x) + m = TestModule() + ms = torch.jit.script(m) + ms.eval() + mf = torch._C._freeze_module(ms._c, ["sub1"]) + + # Test that be both sub1 and sub1 are preserved and 'b' is preserved + # even if it is not used. 
To fulfill user request to preserve 'sub1' + self.assertTrue(mf.hasattr('sub1')) + self.assertTrue(mf.sub1.hasattr('a')) + self.assertTrue(mf.sub1.hasattr('b')) + self.assertTrue(mf.hasattr('sub2')) + self.assertTrue(mf.sub2.hasattr('a')) + self.assertTrue(mf.sub2.hasattr('b')) + input = torch.randn(2, 2) + output_s = ms.forward(input) + output_f = mf.forward(input) + self.assertEqual(output_s, output_f) + + def test_freeze_module_with_helperfunction(self): class SubModule(nn.Module): def __init__(self): diff --git a/test/jit/test_list_dict.py b/test/jit/test_list_dict.py index a1c378963918..19e4952cad57 100644 --- a/test/jit/test_list_dict.py +++ b/test/jit/test_list_dict.py @@ -408,6 +408,43 @@ def test_over_slice(): return a[3:10] == [3, 4] self.checkScript(test_backward_slice, ()) + def test_slice_index(self): + a = torch.tensor( + [ + [[1, 11], [2, 22]], + [[3, 33], [4, 44]], + [[5, 55], [6, 66]], + ] + ) + + def test_index_slice1(x): + x = x[:, :, [0, 1]] + return x + self.checkScript(test_index_slice1, (a,)) + + def test_index_slice2(x): + x = x[[2, 1, 0], :, :] + return x + self.checkScript(test_index_slice2, (a,)) + + def test_index_slice3(x): + x = x[[0, 1], :, [1]] + return x + self.checkScript(test_index_slice3, (a,)) + + def test_index_slice_empty_list(x): + empty_list: List[int] = [] + x = x[empty_list, :, :] + return x + self.checkScript(test_index_slice_empty_list, (a,)) + + def test_index_slice_out_of_bounds_index(x): + x = x[[4], :, :] + return x + with self.assertRaisesRegex(RuntimeError, "index 4 is out of bounds for dimension 0 with size 3"): + self.checkScript(test_index_slice_out_of_bounds_index, (a,)) + + def test_mutable_list_append(self): def test_append(): a = [0, 1] @@ -1155,6 +1192,11 @@ def annotated_fn(x: torch.Tensor) -> List: with self.assertRaisesRegex(RuntimeError, r"Attempted to use List without a contained type"): torch.jit.script(annotated_fn) + def test_list_none(self): + with self.assertRaisesRegex(RuntimeError, "Can not create ListType with None type"): + x = torch._C.ListType(None) + + class TestDict(JitTestCase): def dict(self): diff --git a/test/jit/test_module_interface.py b/test/jit/test_module_interface.py index 963c1ede8323..f06dafbc1ba2 100644 --- a/test/jit/test_module_interface.py +++ b/test/jit/test_module_interface.py @@ -595,6 +595,58 @@ def forward(self, x): with self.assertRaisesRegex(RuntimeError, "failed to freeze interface attribute 'proxy_mod'"): mf = torch._C._freeze_module(m._c, freezeInterfaces = True) + def test_freeze_module_with_interface_and_fork(self): + class SubModule(torch.nn.Module): + def __init__(self): + super(SubModule, self).__init__() + self.b = torch.tensor([1.5]) + + def forward(self, x): + self.b[0] += 3.2 + return self.b + + class OrigMod(torch.nn.Module): + def __init__(self): + super(OrigMod, self).__init__() + self.a = torch.tensor([0.5]) + + def forward(self, x): + return self.a + + @torch.jit.interface + class ModInterface(torch.nn.Module): + def forward(self, x): + # type: (Tensor) -> Tensor + pass + + class TestModule(torch.nn.Module): + proxy_mod : ModInterface + + def __init__(self): + super(TestModule, self).__init__() + self.proxy_mod = OrigMod() + self.sub = SubModule() + + def forward(self, x): + y = self.proxy_mod(x); + z= self.sub(x) + return y + z + + class MainModule(torch.nn.Module): + def __init__(self): + super(MainModule, self).__init__() + self.test= TestModule(); + + def forward(self, x): + fut = torch.jit._fork(self.test.forward, x) + y = self.test(x) + z = torch.jit._wait(fut) + 
return y + z + + m = torch.jit.script(MainModule()) + m.eval() + mf = torch._C._freeze_module(m._c, freezeInterfaces = True) + def test_module_apis_interface(self): @torch.jit.interface class ModuleInterface(nn.Module): diff --git a/test/jit/test_profiler.py b/test/jit/test_profiler.py index 50d4351a4870..55604f5ff6bf 100644 --- a/test/jit/test_profiler.py +++ b/test/jit/test_profiler.py @@ -83,6 +83,7 @@ def test_fuse(a, b): # that guards a tensorexpr group optimized_block = next(g.findNode("prim::If").blocks()) if_nodes = list(optimized_block.findAllNodes("prim::If")) + self.assertEqual(len(if_nodes), 1) FileCheck().check("Group[Subgraph").run(str(if_nodes[0])) # no broadcasts occurred, sum_to_size have been specialized out @@ -191,3 +192,24 @@ def foo(a, b): g = torch.jit.last_executed_optimized_graph() FileCheck().check("fallback_function").check_next("CallFunction").run(g) + + def test_iterative_fusion(self): + @torch.jit.script + def foo(a, b, c, d): + a = a + b + b.add_(3) + c = c + b + d + a = a + 1 + return a, c + + x = torch.ones(1, requires_grad=False) + foo(x, x, x, x) + foo(x, x, x, x) + + # when we iterate through the block, we will start + # by fusing a = a + b with a = a + 1 + # if we were to continue iteration from that fusion point, + # would miss the fusion opportunity of c = c + d + b + + g = torch.jit.last_executed_optimized_graph() + self.assertEqual(len(list(g.findAllNodes("prim::TensorExprGroup"))), 2) diff --git a/test/jit/test_remove_mutation.py b/test/jit/test_remove_mutation.py index ef408e775c33..b747fc06bcde 100644 --- a/test/jit/test_remove_mutation.py +++ b/test/jit/test_remove_mutation.py @@ -200,3 +200,44 @@ def intermediary_use(): # it is possible to remove the append here but don't currently have the logic for it FileCheck().check_not("append").run(graph) self.assertEqual(intermediary_use(), fn()) + + def test_common_pytorch_list_ops(self): + for op in ["cat", "stack", "vstack", "hstack", "dstack"]: + class OpMod(torch.nn.Module): + def __init__(self, op): + super(OpMod, self).__init__() + self.op = torch_op + + def forward(self): + x = torch.tensor([1, 2, 3, 4]) + x.add_(3) + y = [x, x] + return self.op(y) + 3 + + torch_op = getattr(torch, op) + mod = OpMod(torch_op) + mod_script = torch.jit.script(mod) + self.run_pass('remove_mutation', mod_script.forward.graph) + FileCheck().check_not("aten::add_").run(mod_script.forward.graph) + self.assertEqual(mod(), mod_script()) + + # test that the output doesnt alias the input + for inputs in [torch.rand(2, 2)], [torch.rand(2, 2) for _ in range(2)]: + result = torch_op(inputs) + sums = [ten.sum() for ten in result] + + for inp in inputs: + inp.fill_(10) + + self.assertEqual(sums, [ten.sum() for ten in result]) + + + @torch.jit.script + def test_multiple_uses(): + x = torch.tensor([1, 2, 3, 4]) + x.add_(3) + y = [x, x] + return torch.cat(y), y + + self.run_pass('remove_mutation', mod_script.forward.graph) + FileCheck().check("aten::add_").run(test_multiple_uses.graph) diff --git a/test/jit/test_torchbind.py b/test/jit/test_torchbind.py index a8bea73c984d..ee288b65551f 100644 --- a/test/jit/test_torchbind.py +++ b/test/jit/test_torchbind.py @@ -12,7 +12,7 @@ pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) sys.path.append(pytorch_test_dir) from torch.testing._internal.jit_utils import JitTestCase -from torch.testing._internal.common_utils import TEST_WITH_ROCM, IS_WINDOWS, IS_SANDCASTLE, IS_MACOS +from torch.testing._internal.common_utils import TEST_WITH_ROCM, IS_WINDOWS, 
IS_SANDCASTLE, IS_MACOS, IS_FBCODE from torch.testing import FileCheck if __name__ == "__main__": @@ -24,10 +24,14 @@ class TestTorchbind(JitTestCase): def setUp(self): - if TEST_WITH_ROCM or IS_SANDCASTLE or IS_WINDOWS or IS_MACOS: + if IS_SANDCASTLE or IS_WINDOWS or IS_MACOS or IS_FBCODE: raise unittest.SkipTest("non-portable load_library call used in test") - torch_root = Path(__file__).resolve().parent.parent.parent - p = torch_root / 'build' / 'lib' / 'libtorchbind_test.so' + if TEST_WITH_ROCM: + torch_root = Path(torch.__file__).resolve().parent + p = torch_root / 'lib' / 'libtorchbind_test.so' + else: + torch_root = Path(__file__).resolve().parent.parent.parent + p = torch_root / 'build' / 'lib' / 'libtorchbind_test.so' torch.ops.load_library(str(p)) def test_torchbind(self): diff --git a/test/jit/test_tracer.py b/test/jit/test_tracer.py index 22921f7d684a..24db4cfe857e 100644 --- a/test/jit/test_tracer.py +++ b/test/jit/test_tracer.py @@ -18,6 +18,7 @@ IS_SANDCASTLE, IS_WINDOWS from torch.testing._internal.jit_utils import JitTestCase, enable_cpu_fuser, \ _tmp_donotuse_dont_inline_everything, _trace, RUN_CUDA, RUN_CUDA_MULTI_GPU +from torch.testing._internal.common_cuda import with_tf32_off from typing import List, Tuple from torch import Tensor @@ -900,6 +901,9 @@ def foo(a): self.assertEqual(foo(x), x + x + x) @unittest.skipIf(not RUN_CUDA, "calls .cuda()") + # By default, on Ampere or later GPUs, nn.Linear computes float tensors at TF32 precision. + # We want float tensors to be computed at full precision in order to use the default precision + @with_tf32_off def test_traced_module_cuda(self): class Model(nn.Module): def __init__(self, num_features, num_layers): @@ -1310,6 +1314,39 @@ def check(mod): imported = self.getExportImportCopy(traced) check(imported.foo) + # Note that Bar's forward can only be traced, but not scripted + class Bar(nn.Module): + def __init__(self): + super().__init__() + + @torch.jit.export + def addTwo(self, x): + return x + 2 + + def forward(self, input): + return (lambda a: a + 1)(input) + + # When tracing Bar as a submodule, we only want to script the + # exported methods, and we want to keep the forwards still + # being traced. 
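# Illustrative sketch, not from this patch: a condensed form of the export-while-tracing
# pattern exercised by the tracer test here -- forward() is only traceable (it uses a
# lambda), while the @torch.jit.export method is compiled during tracing and should stay
# callable on the traced module. Class and method names below are illustrative.
import torch

class Inner(torch.nn.Module):
    def forward(self, x):
        return (lambda a: a + 1)(x)     # traceable, but not scriptable

class Wrapper(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.inner = Inner()

    @torch.jit.export
    def add_one(self, x):
        return x + 1                    # exported -> compiled even under tracing

    def forward(self, x):
        return self.inner(x)

traced = torch.jit.trace(Wrapper(), (torch.rand(3, 4),))
out = traced.add_one(torch.zeros(1))    # exported method survives on the traced module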
+ class WrapperExports(torch.nn.Module): + def __init__(self): + super(WrapperExports, self).__init__() + self.bar = Bar() + + @torch.jit.export + def addOne(self, x): + return x + 1 + + def forward(self, x): + return self.bar(x) + + f = WrapperExports() + + traced = torch.jit.trace(f, (torch.rand(3, 4),)) + expected_names = ['addOne'] + check(traced) + def test_trace_autograd_function(self): class TestFunc(torch.autograd.Function): @staticmethod diff --git a/test/jit/test_warn.py b/test/jit/test_warn.py new file mode 100644 index 000000000000..6a89ba4dc385 --- /dev/null +++ b/test/jit/test_warn.py @@ -0,0 +1,165 @@ +import os +import sys +import io + +import torch +import warnings +from contextlib import redirect_stderr +from torch.testing import FileCheck + +# Make the helper files in test/ importable +pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) +sys.path.append(pytorch_test_dir) +from torch.testing._internal.jit_utils import JitTestCase + +if __name__ == '__main__': + raise RuntimeError("This test file is not meant to be run directly, use:\n\n" + "\tpython test/test_jit.py TESTNAME\n\n" + "instead.") + + +class TestWarn(JitTestCase): + def test_warn(self): + @torch.jit.script + def fn(): + warnings.warn("I am warning you") + + f = io.StringIO() + with redirect_stderr(f): + fn() + + FileCheck() \ + .check_count( + str="UserWarning: I am warning you", + count=1, + exactly=True) \ + .run(f.getvalue()) + + def test_warn_only_once(self): + @torch.jit.script + def fn(): + for _ in range(10): + warnings.warn("I am warning you") + + f = io.StringIO() + with redirect_stderr(f): + fn() + + FileCheck() \ + .check_count( + str="UserWarning: I am warning you", + count=1, + exactly=True) \ + .run(f.getvalue()) + + def test_warn_only_once_in_loop_func(self): + def w(): + warnings.warn("I am warning you") + + @torch.jit.script + def fn(): + for _ in range(10): + w() + + f = io.StringIO() + with redirect_stderr(f): + fn() + + FileCheck() \ + .check_count( + str="UserWarning: I am warning you", + count=1, + exactly=True) \ + .run(f.getvalue()) + + def test_warn_once_per_func(self): + def w1(): + warnings.warn("I am warning you") + + def w2(): + warnings.warn("I am warning you") + + @torch.jit.script + def fn(): + w1() + w2() + + f = io.StringIO() + with redirect_stderr(f): + fn() + + FileCheck() \ + .check_count( + str="UserWarning: I am warning you", + count=2, + exactly=True) \ + .run(f.getvalue()) + + def test_warn_once_per_func_in_loop(self): + def w1(): + warnings.warn("I am warning you") + + def w2(): + warnings.warn("I am warning you") + + @torch.jit.script + def fn(): + for _ in range(10): + w1() + w2() + + f = io.StringIO() + with redirect_stderr(f): + fn() + + FileCheck() \ + .check_count( + str="UserWarning: I am warning you", + count=2, + exactly=True) \ + .run(f.getvalue()) + + def test_warn_multiple_calls_multiple_warnings(self): + @torch.jit.script + def fn(): + warnings.warn("I am warning you") + + f = io.StringIO() + with redirect_stderr(f): + fn() + fn() + + FileCheck() \ + .check_count( + str="UserWarning: I am warning you", + count=2, + exactly=True) \ + .run(f.getvalue()) + + def test_warn_multiple_calls_same_func_diff_stack(self): + def warn(caller: str): + warnings.warn("I am warning you from " + caller) + + @torch.jit.script + def foo(): + warn("foo") + + @torch.jit.script + def bar(): + warn("bar") + + f = io.StringIO() + with redirect_stderr(f): + foo() + bar() + + FileCheck() \ + .check_count( + str="UserWarning: I am warning you from foo", + 
count=1, + exactly=True) \ + .check_count( + str="UserWarning: I am warning you from bar", + count=1, + exactly=True) \ + .run(f.getvalue()) diff --git a/test/jit/test_with.py b/test/jit/test_with.py index 15e1362ea722..ffd0631639f6 100644 --- a/test/jit/test_with.py +++ b/test/jit/test_with.py @@ -359,6 +359,7 @@ def test_with_exceptions(self): Check that exceptions thrown in the bodies of with-statements are handled correctly. """ + global Context @torch.jit.script class Context(object): @@ -379,10 +380,12 @@ def __enter__(self): def __exit__(self, type: Any, value: Any, tb: Any): self.count.sub_(0.3) + @torch.jit.script def method_that_raises(): # type: () -> Tensor - raise Exception() + raise Exception("raised exception") + @torch.jit.script def test_exception(x, c): # type: (Tensor, Context) -> Tensor """ @@ -393,6 +396,7 @@ def test_exception(x, c): return x + @torch.jit.script def test_exception_nested(x, c): # type: (Tensor, Context) -> Tensor """ @@ -404,6 +408,7 @@ def test_exception_nested(x, c): return x + @torch.jit.script def with_that_raises(c): # type: (Context) -> Tensor a = torch.tensor([1]) @@ -413,6 +418,7 @@ def with_that_raises(c): return a + @torch.jit.script def test_exception_fn_call(x, c): # type: (Tensor, Context) -> Tensor """ @@ -426,15 +432,18 @@ def test_exception_fn_call(x, c): c = Context(1) - with self.assertRaises(Exception): + # checkScript and checkScriptRaisesRegex cannot be used because the string frontend will + # not compile class types (of which Context, the context manager being used for this test + # is one). + with self.assertRaisesRegex(Exception, r"raised exception"): test_exception(torch.randn(2), c) self.assertEqual(c.count, 1) - with self.assertRaises(Exception): + with self.assertRaisesRegex(Exception, r"raised exception"): test_exception_nested(torch.randn(2), c) self.assertEqual(c.count, 1) - with self.assertRaises(Exception): + with self.assertRaisesRegex(Exception, r"raised exception"): test_exception_fn_call(torch.randn(2), c) self.assertEqual(c.count, 1) diff --git a/test/module_a.py b/test/module_a.py new file mode 100644 index 000000000000..685af9bc1569 --- /dev/null +++ b/test/module_a.py @@ -0,0 +1 @@ +result = 'module_a' diff --git a/torch/csrc/jit/tensorexpr/buffer.cpp b/test/namespace_b/subpackage.py similarity index 100% rename from torch/csrc/jit/tensorexpr/buffer.cpp rename to test/namespace_b/subpackage.py diff --git a/test/onnx/expect/TestOperators.test_softmaxcrossentropy.expect b/test/onnx/expect/TestOperators.test_softmaxcrossentropy.expect index cde473fcdb4d..1479846789d4 100644 --- a/test/onnx/expect/TestOperators.test_softmaxcrossentropy.expect +++ b/test/onnx/expect/TestOperators.test_softmaxcrossentropy.expect @@ -8,6 +8,11 @@ graph { output: "2" name: "SoftmaxCrossEntropyLoss_0" op_type: "SoftmaxCrossEntropyLoss" + attribute { + name: "ignore_index" + i: -100 + type: INT + } attribute { name: "reduction" s: "mean" diff --git a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d.expect b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d.expect index 58d8c805163d..f5cfba35b032 100644 --- a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d.expect +++ b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d.expect @@ -8,6 +8,11 @@ graph { output: "2" name: "SoftmaxCrossEntropyLoss_0" op_type: "SoftmaxCrossEntropyLoss" + attribute { + name: "ignore_index" + i: -100 + type: INT + } attribute { name: "reduction" s: "mean" diff --git 
a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d_none.expect b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d_none.expect index 10d47a6ed84d..8b0ec04b24c8 100644 --- a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d_none.expect +++ b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d_none.expect @@ -8,6 +8,11 @@ graph { output: "2" name: "SoftmaxCrossEntropyLoss_0" op_type: "SoftmaxCrossEntropyLoss" + attribute { + name: "ignore_index" + i: -100 + type: INT + } attribute { name: "reduction" s: "none" diff --git a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_4d.expect b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_4d.expect index 6ccab9f7b50f..8d3539ca1c64 100644 --- a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_4d.expect +++ b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_4d.expect @@ -8,6 +8,11 @@ graph { output: "2" name: "SoftmaxCrossEntropyLoss_0" op_type: "SoftmaxCrossEntropyLoss" + attribute { + name: "ignore_index" + i: -100 + type: INT + } attribute { name: "reduction" s: "mean" diff --git a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_weights.expect b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_weights.expect index 1ea4adac8cab..bf1667b58812 100644 --- a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_weights.expect +++ b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_weights.expect @@ -9,6 +9,11 @@ graph { output: "3" name: "SoftmaxCrossEntropyLoss_0" op_type: "SoftmaxCrossEntropyLoss" + attribute { + name: "ignore_index" + i: -100 + type: INT + } attribute { name: "reduction" s: "mean" diff --git a/test/onnx/expect/TestOperators.test_view.expect b/test/onnx/expect/TestOperators.test_view.expect index 75202b5d0da2..abd2276e7716 100644 --- a/test/onnx/expect/TestOperators.test_view.expect +++ b/test/onnx/expect/TestOperators.test_view.expect @@ -3,16 +3,26 @@ producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "0" output: "1" - name: "Flatten_0" - op_type: "Flatten" + name: "Constant_0" + op_type: "Constant" attribute { - name: "axis" - i: 1 - type: INT + name: "value" + t { + dims: 2 + data_type: 7 + raw_data: "\001\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000" + } + type: TENSOR } } + node { + input: "0" + input: "1" + output: "2" + name: "Reshape_1" + op_type: "Reshape" + } name: "torch-jit-export" input { name: "0" @@ -28,7 +38,7 @@ graph { } } output { - name: "1" + name: "2" type { tensor_type { elem_type: 1 diff --git a/test/onnx/expect/TestOperators.test_view_flatten.expect b/test/onnx/expect/TestOperators.test_view_flatten.expect index 07667797e2cf..5ae9c0576c7a 100644 --- a/test/onnx/expect/TestOperators.test_view_flatten.expect +++ b/test/onnx/expect/TestOperators.test_view_flatten.expect @@ -65,60 +65,40 @@ graph { } } node { - input: "6" output: "7" - name: "Cast_6" - op_type: "Cast" - attribute { - name: "to" - i: 11 - type: INT - } - } - node { - output: "8" - name: "Constant_7" + name: "Constant_6" op_type: "Constant" attribute { name: "value" t { - data_type: 11 - raw_data: "\000\000\000\000\000\000\360?" 
+ data_type: 7 + raw_data: "\030\000\000\000\000\000\000\000" } type: TENSOR } } node { - input: "8" input: "7" - output: "9" - name: "Div_8" + input: "6" + output: "8" + name: "Div_7" op_type: "Div" } node { - output: "10" - name: "Constant_9" - op_type: "Constant" + input: "8" + output: "9" + name: "Cast_8" + op_type: "Cast" attribute { - name: "value" - t { - data_type: 11 - raw_data: "\000\000\000\000\000\0008@" - } - type: TENSOR + name: "to" + i: 7 + type: INT } } node { input: "9" - input: "10" - output: "11" - name: "Mul_10" - op_type: "Mul" - } - node { - input: "11" - output: "12" - name: "Cast_11" + output: "10" + name: "Cast_9" op_type: "Cast" attribute { name: "to" @@ -128,8 +108,8 @@ graph { } node { input: "3" - output: "13" - name: "Unsqueeze_12" + output: "11" + name: "Unsqueeze_10" op_type: "Unsqueeze" attribute { name: "axes" @@ -138,9 +118,9 @@ graph { } } node { - input: "12" - output: "14" - name: "Unsqueeze_13" + input: "10" + output: "12" + name: "Unsqueeze_11" op_type: "Unsqueeze" attribute { name: "axes" @@ -149,10 +129,10 @@ graph { } } node { - input: "13" - input: "14" - output: "15" - name: "Concat_14" + input: "11" + input: "12" + output: "13" + name: "Concat_12" op_type: "Concat" attribute { name: "axis" @@ -162,9 +142,9 @@ graph { } node { input: "0" - input: "15" - output: "16" - name: "Reshape_15" + input: "13" + output: "14" + name: "Reshape_13" op_type: "Reshape" } name: "torch-jit-export" @@ -191,7 +171,7 @@ graph { } } output { - name: "16" + name: "14" type { tensor_type { elem_type: 1 diff --git a/test/onnx/test_models.py b/test/onnx/test_models.py index 6f37fa6d7e72..f91f6bea165b 100644 --- a/test/onnx/test_models.py +++ b/test/onnx/test_models.py @@ -49,7 +49,6 @@ class TestModels(TestCase): opset_version = _export_onnx_opset_version def exportTest(self, model, inputs, rtol=1e-2, atol=1e-7): - self.is_script_test_enabled = True with torch.onnx.select_model_mode_for_export(model, None): graph = torch.onnx.utils._trace(model, inputs, OperatorExportTypes.ONNX) torch._C._jit_pass_lint(graph) @@ -94,14 +93,12 @@ def test_srresnet(self): self.exportTest(toC(SRResNet(rescale_factor=4, n_filters=64, n_blocks=8)), toC(x)) @skipIfNoLapack - @disableScriptTest() def test_super_resolution(self): x = Variable( torch.randn(BATCH_SIZE, 1, 224, 224).fill_(1.0) ) self.exportTest(toC(SuperResolutionNet(upscale_factor=3)), toC(x), atol=1e-6) - @disableScriptTest() def test_alexnet(self): x = Variable( torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0) @@ -137,13 +134,12 @@ def test_vgg19_bn(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(vgg19_bn()), toC(x)) - @disableScriptTest() def test_resnet(self): # ResNet50 model x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(resnet50()), toC(x), atol=1e-6) - @disableScriptTest() + @disableScriptTest() # None type in outputs def test_inception(self): x = Variable( torch.randn(BATCH_SIZE, 3, 299, 299) + 1.) 
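# Illustrative sketch, not from this patch: the test_models changes above re-enable
# scripting for several torchvision models before export. Outside the test harness,
# the bare mechanics of exporting a ScriptModule to ONNX look roughly like this
# (example_outputs is how this exporter version learns output shapes for script
# modules; the model here is a toy stand-in):
import io
import torch

class TinyModel(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x) + 1

scripted = torch.jit.script(TinyModel())
x = torch.randn(1, 3)
buf = io.BytesIO()
torch.onnx.export(scripted, (x,), buf,
                  example_outputs=scripted(x), opset_version=12)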
@@ -208,22 +204,20 @@ def test_qat_resnet(self): self.exportTest(toC(qat_resnet50), toC(x)) - @disableScriptTest() + @disableScriptTest() # None type in outputs def test_googlenet(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(googlenet()), toC(x), rtol=1e-3, atol=1e-5) - @disableScriptTest() def test_mnasnet(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(mnasnet1_0()), toC(x), rtol=1e-3, atol=1e-5) - @disableScriptTest() def test_mobilenet(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(mobilenet_v2()), toC(x), rtol=1e-3, atol=1e-5) - @disableScriptTest() + @disableScriptTest() # prim_data def test_shufflenet(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(shufflenet_v2_x1_0()), toC(x), rtol=1e-3, atol=1e-5) @@ -238,20 +232,18 @@ def test_deeplab(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(deeplabv3_resnet101()), toC(x), rtol=1e-3, atol=1e-5) - @disableScriptTest() def test_r3d_18_video(self): x = Variable(torch.randn(1, 3, 4, 112, 112).fill_(1.0)) self.exportTest(toC(r3d_18()), toC(x), rtol=1e-3, atol=1e-5) - @disableScriptTest() def test_mc3_18_video(self): x = Variable(torch.randn(1, 3, 4, 112, 112).fill_(1.0)) self.exportTest(toC(mc3_18()), toC(x), rtol=1e-3, atol=1e-5) - @disableScriptTest() def test_r2plus1d_18_video(self): x = Variable(torch.randn(1, 3, 4, 112, 112).fill_(1.0)) self.exportTest(toC(r2plus1d_18()), toC(x), rtol=1e-3, atol=1e-5) + if __name__ == '__main__': run_tests() diff --git a/test/onnx/test_models_onnxruntime.py b/test/onnx/test_models_onnxruntime.py index 657a1479723d..c916b60844d1 100644 --- a/test/onnx/test_models_onnxruntime.py +++ b/test/onnx/test_models_onnxruntime.py @@ -15,13 +15,31 @@ def exportTest(self, model, inputs, rtol=1e-2, atol=1e-7, opset_versions=None): input=inputs, rtol=rtol, atol=atol) if self.is_script_test_enabled and opset_version > 11: + TestModels.use_new_jit_passes = True + TestModels.onnx_shape_inference = True + outputs = model(inputs) script_model = torch.jit.script(model) run_model_test(self, script_model, False, example_outputs=outputs, - input=inputs, rtol=rtol, atol=atol, use_new_jit_passes=True) + input=inputs, rtol=rtol, atol=atol) + + +TestModels = type(str("TestModels"), + (unittest.TestCase,), + dict(TestModels.__dict__, + is_script_test_enabled=False, + exportTest=exportTest)) + + +# model tests for scripting with new JIT APIs and shape inference +TestModels_new_jit_API = type(str("TestModels_new_jit_API"), + (unittest.TestCase,), + dict(TestModels.__dict__, + exportTest=exportTest, + is_script_test_enabled=True, + use_new_jit_passes=True, + onnx_shape_inference=True)) if __name__ == '__main__': - TestModels.is_script_test_enabled = True - TestModels.exportTest = exportTest unittest.main() diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 816951dfc79e..23d4879a8a4c 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -15,6 +15,7 @@ skipIfUnsupportedMaxOpsetVersion, skipIfONNXShapeInference) from test_pytorch_common import BATCH_SIZE from test_pytorch_common import RNN_BATCH_SIZE, RNN_SEQUENCE_LENGTH, RNN_INPUT_SIZE, RNN_HIDDEN_SIZE +from typing import List import model_defs.word_language_model as word_language_model import torchvision import onnx @@ -189,6 +190,7 @@ def 
run_model_test_with_external_data(self, model, input, rtol=0.001, atol=1e-7, ort_outs = run_ort(ort_sess, input_copy) ort_compare_with_pytorch(ort_outs, output, rtol, atol) + @skipIfUnsupportedMinOpsetVersion(9) # Because external data format was released with Opset 9. def test_embedding_model_with_external_data(self): class LargeModel(torch.nn.Module): @@ -315,7 +317,7 @@ def run_word_language_model(self, model_name): self.run_test(model, (x, model.hidden)) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() + @disableScriptTest() # Faster RCNN model is not scriptable def test_faster_rcnn(self): model = torchvision.models.detection.faster_rcnn.fasterrcnn_resnet50_fpn(pretrained=True, min_size=200, max_size=300) @@ -380,27 +382,53 @@ def test_word_language_model_LSTM(self): def test_word_language_model_GRU(self): self.run_word_language_model("GRU") - @disableScriptTest() def test_index_1d(self): - self._test_index_generic(lambda input: input[0]) + class MyModel(torch.nn.Module): + def forward(self, input): + return input[0] + + m1 = torch.randn(3, 4, 5, 6, 7) + self.run_test(MyModel(), m1) - @disableScriptTest() def test_index_2d_1dimslice(self): - self._test_index_generic(lambda input: input[0:1, :]) + class MyModel(torch.nn.Module): + def forward(self, input): + return input[0:1, :] + + m1 = torch.randn(3, 4, 5, 6, 7) + self.run_test(MyModel(), m1) - @disableScriptTest() def test_index_2d_sliceint(self): - self._test_index_generic(lambda input: input[1, :]) + class MyModel(torch.nn.Module): + def forward(self, input): + return input[1, :] + + m1 = torch.randn(3, 4, 5, 6, 7) + self.run_test(MyModel(), m1) - @disableScriptTest() def test_index_2d_neg_slice(self): - self._test_index_generic(lambda input: input[0:-1, :]) + class MyModel(torch.nn.Module): + def forward(self, input): + return input[0:-1, :] + + m1 = torch.randn(3, 4, 5, 6, 7) + self.run_test(MyModel(), m1) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() def test_index_mask(self): - self._test_index_generic(lambda input: input[torch.tensor([0, 1, 0], dtype=torch.uint8)]) - self._test_index_generic(lambda input: input[torch.tensor([0, 1, 0], dtype=torch.bool)]) + class MyModel(torch.nn.Module): + def forward(self, input): + return input[torch.tensor([0, 1, 0], dtype=torch.uint8)] + + m1 = torch.randn(3, 4, 5, 6, 7) + self.run_test(MyModel(), m1) + + class MyModel(torch.nn.Module): + def forward(self, input): + return input[torch.tensor([0, 1, 0], dtype=torch.bool)] + + m1 = torch.randn(3, 4, 5, 6, 7) + self.run_test(MyModel(), m1) @disableScriptTest() def test_dict(self): @@ -612,6 +640,20 @@ def forward(self, input1, input2, input3): self.run_test(TraceModel(), (x1, x2, x3), atol=10e-5) self.run_test(ScriptModel(), (x1, x2, x3), atol=10e-5) + def test_conv_shape_inference(self): + class Model(torch.nn.Module): + def __init__(self): + super(Model, self).__init__() + self.conv2 = torch.nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1)) + + def forward(self, input): + return self.conv2(input) + 2 + + x = torch.randn(20, 16, 50, 100) + self.run_test(Model(), x, atol=10e-5, + input_names=['x'], + dynamic_axes={'x': [0]}) + def test_conv_transpose(self): class TraceModel(torch.nn.Module): def __init__(self): @@ -660,14 +702,18 @@ def forward(self, x): def squeeze_model_tests(self, d, x1, x2): class Squeeze(torch.nn.Module): + def __init__(self, d): + super(Squeeze, self).__init__() + self.d = d + def forward(self, x): - if d is not None: - return torch.squeeze(x, dim=d) + if self.d is not 
None: + return torch.squeeze(x, dim=self.d) else: return torch.squeeze(x) x2 = [] if x2 is None else [x2] - self.run_test(Squeeze(), x1, input_names=['input'], dynamic_axes={'input': {0: '0', 1: '1', 2: '2'}}, test_with_inputs=x2) + self.run_test(Squeeze(d), x1, input_names=['input'], dynamic_axes={'input': {0: '0', 1: '1', 2: '2'}}, test_with_inputs=x2) def test_squeeze_without_no_op(self): x = torch.randn(2, 1, 4) @@ -761,7 +807,7 @@ def test_maxpool_3d_ceil(self): self.run_test(model, x) @skipIfUnsupportedMinOpsetVersion(8) - @disableScriptTest() + @disableScriptTest() # Functional module not scriptable def test_maxpool_with_indices(self): model = torch.nn.MaxPool1d(2, stride=1, return_indices=True) x = torch.randn(20, 16, 50) @@ -814,7 +860,6 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_test(ArithmeticModule(), x) - @disableScriptTest() # In scripting the first transpose node do not carry shape and dtype info. # The following test only works when onnx shape inference is enabled. @skipIfONNXShapeInference(False) @@ -868,7 +913,7 @@ def forward(self, x): def test_div(self): class DivModule(torch.nn.Module): def forward(self, x, y): - return x / y + return x / y, torch.true_divide(x, y) x = torch.randn(2, 3, 4).to(torch.int) y = torch.arange(1, 2 * 3 * 4 + 1).reshape(2, 3, 4).to(torch.int) @@ -882,7 +927,7 @@ def forward(self, x, y): def test_div_promotion_trace(self): class DivModule(torch.nn.Module): def forward(self, x, y): - return x / y + return x / y, torch.true_divide(x, y) x = torch.randn(2, 3, 4).to(torch.int) y = torch.arange(1, 2 * 3 * 4 + 1).reshape(2, 3, 4).to(torch.int) @@ -900,14 +945,14 @@ def forward(self, x, y): # In scripting x, y do not carry shape and dtype info. # The following test only works when onnx shape inference is enabled. @skipIfONNXShapeInference(False) - def test_true_div_script(self): - class TrueDivModule(torch.nn.Module): + def test_div_promotion_script(self): + class DivModule(torch.nn.Module): def forward(self, x, y): # Add transpose to hide shape/type information # Otherwise shape and type are still avaiable from input. x = x.transpose(1, 2) y = y.transpose(1, 2) - return torch.true_divide(x, y) + return x / y, torch.true_divide(x, y) x = torch.randn(2, 3, 4).to(torch.int) y = torch.arange(1, 2 * 3 * 4 + 1).reshape(2, 3, 4).to(torch.int) @@ -918,20 +963,20 @@ def forward(self, x, y): # This can be handled by the default case, where both are cast to float. # It works even if type of x, y are unknown. torch.set_default_dtype(torch.float) - self.run_test(torch.jit.script(TrueDivModule()), (x, y)) + self.run_test(torch.jit.script(DivModule()), (x, y)) # 2. x,y are int, and output is double. # This can be handled by the default case, where both are cast to double. # It works even if type of x, y are unknown. torch.set_default_dtype(torch.double) - self.run_test(torch.jit.script(TrueDivModule()), (x, y)) + self.run_test(torch.jit.script(DivModule()), (x, y)) # 3. x is int, y is double, and output is double. # This can only be handled when both type of x and y are known. 
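
The `test_div*` changes above pair `x / y` with `torch.true_divide(x, y)` on integer inputs. A quick standalone check of the promotion behaviour those tests rely on, assuming a PyTorch build where `/` on integer tensors performs true division (as the updated tests expect):

```python
import torch

# Integer / integer performs true division, matches torch.true_divide,
# and promotes to the current default floating dtype.
x = torch.arange(1, 7, dtype=torch.int32).reshape(2, 3)
y = torch.full((2, 3), 2, dtype=torch.int32)
assert torch.equal(x / y, torch.true_divide(x, y))
assert (x / y).dtype == torch.get_default_dtype()
```
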
torch.set_default_dtype(prev_default) x = torch.randn(2, 3, 4).to(torch.int) y = torch.arange(1, 2 * 3 * 4 + 1).reshape(2, 3, 4).to(torch.double) - self.run_test(torch.jit.script(TrueDivModule()), (x, y)) + self.run_test(torch.jit.script(DivModule()), (x, y)) def test_slice_trace(self): class MyModule(torch.nn.Module): @@ -977,7 +1022,7 @@ def forward(self, x, y): self.run_test(InputIndexSlice(), (x, y)) @skipIfUnsupportedMinOpsetVersion(10) - @disableScriptTest() + @disableScriptTest() # scripting tuple/list append def test_slice_dynamic(self): class DynamicSliceExportMod(torch.nn.Module): def forward(self, x): @@ -1014,7 +1059,7 @@ def forward(self, x): self.run_test(DynamicSliceModel(), x) @skipIfUnsupportedMinOpsetVersion(10) - @disableScriptTest() + @disableScriptTest() # scripting tuple/list append def test_slice_dynamic_to_end(self): class DynamicSliceExportMod(torch.nn.Module): def forward(self, x): @@ -1121,7 +1166,7 @@ def forward(self, input): self.run_test(SizeModel(), x) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() + @disableScriptTest() # x.stride() not scriptable def test_as_strided(self): class Model(torch.nn.Module): def forward(self, x): @@ -1134,28 +1179,42 @@ def forward(self, x): x = torch.randn(5, 8, 7) self.run_test(Model(), x) - def _test_index_generic(self, fn): + @disableScriptTest() # Ellipses followed by tensor indexing not scriptable + def test_tensor_index_advanced_indexing_ellipsis(self): class MyModel(torch.nn.Module): - def __init__(self): - super(MyModel, self).__init__() - def forward(self, input): - return fn(input) + return input[..., torch.tensor([2, 1]), torch.tensor([0, 3])] m1 = torch.randn(3, 4, 5, 6, 7) - self.run_test(MyModel(), m1) + self.run_test(MyModel(), (m1,)) - @disableScriptTest() def test_tensor_index_advanced_indexing(self): - self._test_index_generic( - lambda input: input[:, torch.tensor([[0, 2], [1, 1]]), :, torch.tensor([2, 1]), torch.tensor([0, 3])]) - self._test_index_generic(lambda input: input[..., torch.tensor([2, 1]), torch.tensor([0, 3])]) - self._test_index_generic(lambda input: input[:, torch.tensor([0, 2]), None, 2:4, torch.tensor([[1, 3], [4, 0]])]) - self._test_index_generic(lambda input: input[:, torch.tensor([0, 2]), torch.tensor([1]), 2:4, torch.tensor([[1], [4]])]) + class MyModel(torch.nn.Module): + def forward(self, input): + return input[:, torch.tensor([[0, 2], [1, 1]]), :, torch.tensor([2, 1]), torch.tensor([0, 3])] + + m1 = torch.randn(3, 4, 5, 6, 7) + self.run_test(MyModel(), (m1,)) + + class MyModel(torch.nn.Module): + def forward(self, input): + return input[:, torch.tensor([0, 2]), None, 2:4, torch.tensor([[1, 3], [4, 0]])] + + self.run_test(MyModel(), (m1,)) + + class MyModel(torch.nn.Module): + def forward(self, input): + return input[:, torch.tensor([0, 2]), torch.tensor([1]), 2:4, torch.tensor([[1], [4]])] + + self.run_test(MyModel(), (m1,)) - @disableScriptTest() def test_tensor_index_advanced_indexing_consecutive(self): - self._test_index_generic(lambda input: input[:, torch.tensor([0, 2]), torch.tensor([[1, 3], [4, 0]]), None]) + class MyModel(torch.nn.Module): + def forward(self, input): + return input[:, torch.tensor([0, 2]), torch.tensor([[1, 3], [4, 0]]), None] + + m1 = torch.randn(3, 4, 5, 6, 7) + self.run_test(MyModel(), (m1,)) @skipIfUnsupportedMinOpsetVersion(11) def test_index_put(self): @@ -1181,7 +1240,6 @@ def forward(self, x, ind, update): self.run_test(IndexPutModel(), (x, ind, update)) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() def 
test_index_put_slice_index(self): class IndexPutModel(torch.nn.Module): def forward(self, x, update): @@ -1256,7 +1314,7 @@ def forward(self, x, update): self.run_test(IndexPutModel8(), (x, update)) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() + @disableScriptTest() # Ellipses followed by tensor indexing not scriptable def test_index_put_ellipsis(self): class IndexPutModel(torch.nn.Module): def forward(self, x, update): @@ -1277,7 +1335,6 @@ def forward(self, x, update): self.run_test(IndexPutModel2(), (x, update)) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() def test_copy_(self): class CopyModel(torch.nn.Module): def forward(self, x, data): @@ -1319,9 +1376,6 @@ def forward(self, x, data): update = torch.randn(2) self.run_test(CopyModel3(), (x, update)) - update = torch.randn(1, 2) - self.run_test(CopyModel3(), (x, update)) - class CopyModel4(torch.nn.Module): def forward(self, x, ind, data): x[ind] = data @@ -1333,7 +1387,18 @@ def forward(self, x, ind, data): self.run_test(CopyModel4(), (x, ind, data)) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() + @disableScriptTest() # Model not scriptable (output with shape doesn't match the broadcast shape) + def test_copy_tracing(self): + class CopyModel(torch.nn.Module): + def forward(self, x, data): + x[1, 1:3] = data + return x + + x = torch.randn(3, 4) + update = torch.randn(1, 2) + self.run_test(CopyModel(), (x, update)) + + @skipIfUnsupportedMinOpsetVersion(11) def test_copy_ellipsis(self): class CopyModel(torch.nn.Module): def forward(self, x, update): @@ -1348,14 +1413,18 @@ def forward(self, x, update): update = torch.ones(1) self.run_test(CopyModel(), (x, update)) - class CopyModel2(torch.nn.Module): + @skipIfUnsupportedMinOpsetVersion(11) + @disableScriptTest() # Missing input size (with ellipsis indexing) + def test_copy_ellipsis_tracing(self): + class CopyModel(torch.nn.Module): def forward(self, x, update): x[2, ..., 1:3] = update return x x = torch.randn(3, 4, 5, 6) + update = torch.ones(1) - self.run_test(CopyModel2(), (x, update)) + self.run_test(CopyModel(), (x, update)) @skipIfUnsupportedMinOpsetVersion(10) def test_flip(self): @@ -1381,8 +1450,8 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_test(Rand(), x) - @disableScriptTest() @skipIfUnsupportedMinOpsetVersion(9) + @disableScriptTest() # symbolic update for randn def test_random_dynamic_size(self): class RandN(torch.nn.Module): def forward(self, x): @@ -1415,7 +1484,6 @@ def forward(self, x): self.run_test(RandLike(), x) self.run_test(torch.jit.script(RandLike()), x) - @disableScriptTest() def test_random_like_dtype(self): class RandNLike(torch.nn.Module): def forward(self, x): @@ -1711,6 +1779,15 @@ def forward(self, input): x = torch.randn(3, 3, requires_grad=True) self.run_test(NarrowModel(), x) + @skipIfUnsupportedMinOpsetVersion(11) + def test_narrow_dynamic(self): + class NarrowModel(torch.nn.Module): + def forward(self, input): + return torch.narrow(input, 0, 0, input.shape[0] - 1) + + x = torch.randn(3, 3, requires_grad=True) + self.run_test(NarrowModel(), x) + @skipIfUnsupportedMinOpsetVersion(9) def test_index_fill(self): class IndexFillModel(torch.nn.Module): @@ -1758,7 +1835,6 @@ def forward(self, x): x = torch.randn(3, 4) self.run_test(IndexSelectScalerIndexModel(), x) - @disableScriptTest() def test_index_select_scaler_index(self): class IndexSelectScalerIndexModel(torch.nn.Module): def __init__(self, index_base): @@ -1817,7 +1893,6 @@ def forward(self, x, k): self.run_test(MyModuleDynamic(), [x, k]) 
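
The indexing tests above were rewritten from the lambda-based `_test_index_generic` helper into small `nn.Module` subclasses so that the same test body can be traced and scripted. A condensed sketch of that pattern (my own minimal example, not a test from this file):

```python
import torch

# Indexing expressed inside a Module's forward can be scripted directly,
# unlike a closure captured by a lambda passed into a generic helper.
class IndexFirstRow(torch.nn.Module):
    def forward(self, x):
        return x[0]

x = torch.randn(3, 4, 5)
scripted = torch.jit.script(IndexFirstRow())
assert torch.equal(scripted(x), x[0])
```
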
@skipIfUnsupportedOpsetVersion([7]) - @disableScriptTest() def test_normalize(self): class Model(torch.nn.Module): def forward(self, x): @@ -1954,7 +2029,6 @@ def forward(self, input, indices): self.run_test(GatherModel(), input=(input, indices)) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() def test_expand(self): class ExpandModel(torch.nn.Module): def forward(self, input): @@ -1975,7 +2049,7 @@ def forward(self, input, size): return input.expand(size) input = torch.randn(3,) - size = torch.tensor([-1]) + size = torch.tensor(-1) self.run_test(ExpandTensorSizeModel(), input=(input, size)) def test_multinomial(self): @@ -2105,6 +2179,7 @@ def test_logsoftmax_dim(self): self.run_test(model, input) @skipIfUnsupportedMinOpsetVersion(9) + @disableScriptTest() # scripting prim_dtype def test_lstm_no_hidden(self): class LSTMModel(torch.nn.Module): def __init__(self): @@ -2134,7 +2209,7 @@ def test_lstm_default_init_state(self): self.run_test(model, input) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() + @disableScriptTest() # LSTMModel model not scriptable def test_lstm_fixed_batch_size(self): class LSTMModel(torch.nn.Module): def __init__(self): @@ -2588,6 +2663,17 @@ def forward(self, input, other): shape = torch.randn(6, 4) self.run_test(ViewModel(), (x, shape)) + def test_view_dynamic_zero_dim(self): + class ViewModel(torch.nn.Module): + def forward(self, input): + input = input.view(-1, 2) + return input.view(1, -1) + + x = torch.ones(2) + another_x = torch.empty((0,)) + self.run_test(ViewModel(), x, test_with_inputs=[another_x], + input_names=['input_1'], dynamic_axes={'input_1': [0, ]}) + def test_view_as(self): class ViewModel(torch.nn.Module): def forward(self, input, other): @@ -2597,7 +2683,7 @@ def forward(self, input, other): y = torch.randn(6, 4) self.run_test(ViewModel(), (x, y)) - @disableScriptTest() + @disableScriptTest() # ONNX Shape inference failure in if/else block for Gemm def test_weight_norm(self): model = torch.nn.utils.weight_norm(torch.nn.Linear(5, 10), dim=1) x = torch.randn(3, 4, 5, requires_grad=True) @@ -2615,7 +2701,7 @@ def test_weight_norm(self): x = torch.randn(3, 3, 5, requires_grad=True) self.run_test(model, x) - @disableScriptTest() + @disableScriptTest() # ONNX Shape inference failure in if/else block for Gemm def test_weight_norm_nodim(self): model = torch.nn.utils.weight_norm(torch.nn.Linear(5, 10), dim=None) x = torch.randn(3, 4, 5, requires_grad=True) @@ -2645,7 +2731,6 @@ def forward(self, x): x = torch.randint(10, (1, 2, 3, 4)) self.run_test(FlattenModel(), x) - @disableScriptTest() @skipIfUnsupportedMinOpsetVersion(9) def test_flatten_dynamic_axes(self): class MyModule(torch.nn.Module): @@ -2680,11 +2765,11 @@ def forward(self, x, y, z, ind): ind = torch.tensor(-2, dtype=torch.long) self.run_test(GetItemModel(), (x, y, z, ind)) - @disableScriptTest() def test_unbind(self): class UnbindModel(torch.nn.Module): def forward(self, input): - return input.unbind() + _, out, _ = input.unbind() + return out x = torch.randn(3, 4, 5) self.run_test(UnbindModel(), x) @@ -2721,7 +2806,7 @@ def test_len_list(self): class LenListModel(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, input): - return torch.ones(len(input.shape)) + return torch.ones(len(input.shape)) x = torch.randn(4, 5) self.run_test(LenListModel(), x) @@ -2744,18 +2829,19 @@ def forward(self, input): x = torch.randn(3, 4, 5) self.run_test(UnbindModel2(), x) - @disableScriptTest() def test_split(self): class SplitModel(torch.nn.Module): def forward(self, 
input): - return input.split([2, 1, 2]) + out1, out2, out3 = input.split([2, 1, 2]) + return out1, out2, out3 x = torch.randn(5, 4, 3) self.run_test(SplitModel(), x) class SplitModel2(torch.nn.Module): def forward(self, input): - return input.split([2, 1, 1], -2) + out1, out2, out3 = input.split([2, 1, 1], -2) + return out1, out2, out3 x = torch.randn(5, 4, 3) self.run_test(SplitModel2(), x) @@ -2772,18 +2858,20 @@ def forward(self, input): @disableScriptTest() def test_split_size_as_list(self): class SplitModel(torch.nn.Module): - def forward(self, input): + def forward(self, input, split_sizes: List[int]): out = [] - split_sizes = [input.shape[0] - 1, 1] - for ob in input.split(split_sizes): + split_list: List[torch.Tensor] = input.split(split_sizes) + + for ob in split_list: out.append(ob) return torch.cat(out, dim=0) - x = torch.randn(5, 4, 3) - self.run_test(SplitModel(), x) + x = torch.randn(6, 4, 3) + split_sizes = [torch.tensor(2), torch.tensor(4)] + self.run_test(SplitModel(), (x, split_sizes)) @skipIfUnsupportedMinOpsetVersion(11) - def test_split_size_list_to_slice(self): + def test_split_size_with_slice(self): class SplitModule(torch.nn.Module): def forward(self, x, y, t): splits = (x.size(1), y.size(1)) @@ -2991,7 +3079,6 @@ def forward(self, x): self.run_test(Zero_(), x) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() def test_new_zeros(self): class Zero_(torch.nn.Module): def forward(self, x): @@ -3056,6 +3143,17 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_test(Full(), x) + @skipIfUnsupportedMinOpsetVersion(9) + def test_inplace_list(self): + class Arithmetic(torch.jit.ScriptModule): + @torch.jit.script_method + def forward(self, x, y): + return torch.cat([x.add_(3), y.fill_(0)]) + + x = torch.randn(2, 3) + y = torch.randn(2, 3) + self.run_test(Arithmetic(), (x, y)) + @skipIfUnsupportedMinOpsetVersion(9) def test_inplace_fill(self): class Fill_(torch.nn.Module): @@ -3119,6 +3217,28 @@ def forward(self, x): x = torch.arange(16).view(2, 2, 4).to(torch.float32) self.run_test(MaskedFillModel2(), x) + @skipIfUnsupportedMinOpsetVersion(9) + def test_masked_fill_inplace(self): + + class MaskedFillModel(torch.jit.ScriptModule): + @torch.jit.script_method + def forward(self, x): + mask = torch.tensor([[0, 0, 1], [1, 1, 0]], dtype=torch.uint8) + x.masked_fill_(mask, 2) + return x + + x = torch.zeros(4, 2, 3, requires_grad=True) + self.run_test(MaskedFillModel(), x) + + class MaskedFillModel2(torch.jit.ScriptModule): + @torch.jit.script_method + def forward(self, x): + x.masked_fill_(x > 3, -1) + return x + + x = torch.arange(16).view(2, 2, 4).to(torch.float32) + self.run_test(MaskedFillModel2(), x) + @skipIfUnsupportedMinOpsetVersion(11) def test_masked_scatter(self): class MaskedScatterModel(torch.nn.Module): @@ -3147,7 +3267,6 @@ def forward(self, x): self.run_test(PixelShuffle(), x) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() def test_scalar_type(self): class ArithmeticModel(torch.nn.Module): def forward(self, x): @@ -3194,7 +3313,7 @@ def forward(self, x): self.run_test(FullModel(), x) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() + @disableScriptTest() # dtype mismatch def test_full_like(self): class FullLikeModel(torch.nn.Module): def forward(self, x): @@ -3204,7 +3323,7 @@ def forward(self, x): self.run_test(FullLikeModel(), x) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() + @disableScriptTest() # dtype mismatch def test_full_like_value(self): class FullLikeModel(torch.nn.Module): def forward(self, x, y): @@ 
-3378,28 +3497,9 @@ def forward(self, input): x = torch.tensor([False, True, True]) self.run_test(model, x) - @unittest.skip("Enable once jit trace Tensor.numel as constant is fixed.") - def test_embedding_bag_dynamic(self): - class EmbeddingModel(torch.nn.Module): - def __init__(self): - super().__init__() - self.embeddingbag = torch.nn.EmbeddingBag(40, 12, mode='sum') - - def forward(self, input): - return self.embeddingbag(input) - - model = EmbeddingModel() - x = torch.randint(7, (10, 5)) - y = torch.randint(10, (20, 5)) - self.run_test(model, x, test_with_inputs=[y], - input_names=['input'], - output_names=['output'], - dynamic_axes={'input': [0], - 'output': [0] - }) - - @disableScriptTest() + @disableScriptTest() # error in propagate as assign input shape @skipIfUnsupportedMinOpsetVersion(10) + @skipIfUnsupportedOpsetVersion([12]) # Due to ONNX Loop shape inference issue def test_embedding_bag(self): model = torch.nn.EmbeddingBag(10, 5, mode='sum', scale_grad_by_freq=True) input = torch.randint(10, (7,)) @@ -3415,27 +3515,29 @@ def test_embedding_bag(self): input = torch.randint(10, (7, 5)) self.run_test(model, (input)) - @disableScriptTest() + @disableScriptTest() # scripting prim::Uninitialized, prim::dtype, prim::unchecked_cast @skipIfUnsupportedMinOpsetVersion(10) + @skipIfUnsupportedOpsetVersion([12]) # Due to ONNX Loop shape inference issue def test_embedding_bag_1d_per_sample_weights(self): class EmbeddingModel(torch.nn.Module): def forward(self, embedding_matrix, input, offset, weights): - return torch.nn.functional.embedding_bag(embedding_matrix, input, offsets=offset, + return torch.nn.functional.embedding_bag(input, embedding_matrix, offsets=offset, mode='sum', per_sample_weights=weights) model = EmbeddingModel() x = torch.randint(7, (6,)) - w = torch.randn(6,) + w = torch.randn(6, ) offset = torch.tensor([0, 2, 5]) embedding_matrix = torch.rand(10, 15) self.run_test(model, (embedding_matrix, x, offset, w)) - @disableScriptTest() + @disableScriptTest() # scripting prim::Uninitialized, prim::dtype, prim::unchecked_cast @skipIfUnsupportedMinOpsetVersion(10) + @skipIfUnsupportedOpsetVersion([12]) # Due to ONNX Loop shape inference issue def test_embedding_bag_2d_per_sample_weights(self): class EmbeddingModel(torch.nn.Module): def forward(self, embedding_matrix, input, weights): - return torch.nn.functional.embedding_bag(embedding_matrix, input, + return torch.nn.functional.embedding_bag(input, embedding_matrix, mode='sum', per_sample_weights=weights) embedding_matrix = torch.rand(10, 15) @@ -3444,12 +3546,52 @@ def forward(self, embedding_matrix, input, weights): w = torch.randn(2, 3) self.run_test(model, (embedding_matrix, x, w)) + @disableScriptTest() # scripting prim::Uninitialized, prim::dtype, prim::unchecked_cast + @skipIfUnsupportedMinOpsetVersion(11) + @unittest.skip("Due to ONNX Loop shape inference issue.") + def test_embedding_bag_dynamic_input(self): + class EmbeddingModel1D(torch.nn.Module): + def forward(self, embedding_matrix, input, weights, offsets): + return torch.nn.functional.embedding_bag(input, embedding_matrix, offsets=offsets, + mode='sum', per_sample_weights=weights) + + model = EmbeddingModel1D() + x = torch.randint(7, (6,)) + w = torch.randn(6, ) + offsets = torch.tensor([0, 2, 5], dtype=torch.long) + embedding_matrix = torch.rand(10, 15) + x2 = torch.randint(7, (2,)) + w2 = torch.randn(2, ) + embedding_matrix2 = torch.rand(12, 25) + offsets2 = torch.tensor([0, ], dtype=torch.long) + self.run_test(model, (embedding_matrix, x, w, offsets), + 
test_with_inputs=[(embedding_matrix2, x2, w2, offsets2)], + input_names=['embedding_matrix', 'x', 'offsets', 'w'], + dynamic_axes={'embedding_matrix': [0, 1], 'x': [0], 'offsets': [0], 'w': [0]}) + + class EmbeddingModel2D(torch.nn.Module): + def forward(self, embedding_matrix, input, weights): + return torch.nn.functional.embedding_bag(input, embedding_matrix, + mode='sum', per_sample_weights=weights) + + model = EmbeddingModel2D() + x = torch.randint(7, (2, 3)) + w = torch.randn(2, 3) + embedding_matrix = torch.rand(10, 15) + x2 = torch.randint(7, (3, 5)) + w2 = torch.randn(3, 5) + embedding_matrix2 = torch.rand(12, 25) + self.run_test(model, (embedding_matrix, x, w), + test_with_inputs=[(embedding_matrix2, x2, w2)], + input_names=['embedding_matrix', 'x', 'w'], + dynamic_axes={'embedding_matrix': [0, 1], 'x': [0, 1], 'w': [0, 1]}) + @skipIfUnsupportedMinOpsetVersion(8) - @disableScriptTest() def test_meshgrid(self): class Meshgrid(torch.nn.Module): def forward(self, x, y, z): - return torch.meshgrid(x, y, z) + output1, output2, output3 = torch.meshgrid(x, y, z) + return output1, output2, output3 x = torch.randn(3, requires_grad=True) y = torch.zeros(4, requires_grad=True) @@ -3457,11 +3599,11 @@ def forward(self, x, y, z): self.run_test(Meshgrid(), (x, y, z)) @skipIfUnsupportedMinOpsetVersion(8) - @disableScriptTest() def test_meshgrid_scalar(self): class Meshgrid(torch.nn.Module): def forward(self, x, y, z): - return torch.meshgrid(x, y, z) + output1, output2, output3 = torch.meshgrid(x, y, z) + return output1, output2, output3 x = torch.ones(3, requires_grad=True) y = torch.zeros(4, requires_grad=True) @@ -3532,7 +3674,6 @@ def forward(self, input, other): self.run_test(model, (x, y)) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() def test_ones_bool(self): class MyModule(torch.nn.Module): def forward(self, input): @@ -3579,7 +3720,7 @@ def test_constant_pad(self): # Dynamic padding is added in opset 11 @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() + @disableScriptTest() # Functional module not scriptable def test_pad_types(self): # Test for different pad integer types class Pad(torch.nn.Module): @@ -3613,7 +3754,7 @@ def run(): self.assertEqual('Unsupported: ONNX export of Pad in opset 9. The sizes of the padding must be constant. 
' + 'Please try opset version 11.', the_exception.args[0]) - @disableScriptTest() + @disableScriptTest() # export prim::Uninitialized def test_reflection_pad(self): model = torch.nn.ReflectionPad1d(2) x = torch.randn(2, 4, 4) @@ -3623,7 +3764,7 @@ def test_reflection_pad(self): x = torch.randn(2, 2, 4, 4) self.run_test(model, x) - @disableScriptTest() + @disableScriptTest() # export prim::Uninitialized def test_replication_pad(self): model = torch.nn.ReplicationPad1d(2) x = torch.randn(2, 4, 4) @@ -3634,7 +3775,7 @@ def test_replication_pad(self): self.run_test(model, x) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() + @disableScriptTest() # export prim::Uninitialized def test_im2col(self): class Unfold(torch.nn.Module): def forward(self, input): @@ -3658,7 +3799,6 @@ def forward(self, x): # This test checks output scalar type in the ONNX graph should not be null # https://github.com/pytorch/pytorch/issues/28607 @skipIfUnsupportedMinOpsetVersion(10) - @disableScriptTest() def test_trace_script(self): @torch.jit.script def center_slice_helper(input, h_offset): @@ -3688,13 +3828,14 @@ def forward(self, input): out = input * 2 out *= out.dim() return out + empty_input = torch.randn(0, requires_grad=True) multi_dim_input = torch.randn(1, 2, 3, requires_grad=True) self.run_test(DimModel(), empty_input) self.run_test(DimModel(), multi_dim_input) @skipIfUnsupportedMinOpsetVersion(12) - @disableScriptTest() + @disableScriptTest() # variable number of inputs not scriptable def test_einsum(self): class EinsumModelBatchDiagonal(torch.nn.Module): def forward(self, *tensor_list): @@ -3731,142 +3872,107 @@ def forward(self, *tensor_list): self.run_test(EinsumModelTranspose(), input=(x,)) @skipIfUnsupportedMinOpsetVersion(12) - @disableScriptTest() + @disableScriptTest() # shape/type inference def test_crossentropyloss(self): - x = torch.randn(3, 5) - y = torch.empty(3, dtype=torch.long).random_(5) - self._crossentropyloss(x, y) + for ignore_index in [-100, 1]: + x = torch.randn(3, 5) + y = torch.empty(3, dtype=torch.long).random_(5) + y[y == 1] = ignore_index - x = torch.randn(3, 5, 2) - y = torch.empty(3, 2, dtype=torch.long).random_(5) - self._crossentropyloss(x, y) + self._crossentropyloss(x, y, ignore_index) - x = torch.randn(3, 5, 2, 7) - y = torch.empty(3, 2, 7, dtype=torch.long).random_(5) - self._crossentropyloss(x, y) + x = torch.randn(3, 5, 2) + y = torch.empty(3, 2, dtype=torch.long).random_(5) + y[y == 1] = ignore_index + self._crossentropyloss(x, y, ignore_index) - def _crossentropyloss(self, x, y): + x = torch.randn(3, 5, 2, 7) + y = torch.empty(3, 2, 7, dtype=torch.long).random_(5) + y[y == 1] = ignore_index + self._crossentropyloss(x, y, ignore_index) + + def _crossentropyloss(self, x, y, ignore_index): class CrossEntropyLossNone(torch.nn.Module): - def __init__(self): + def __init__(self, ignore_index): super(CrossEntropyLossNone, self).__init__() - self.loss = torch.nn.CrossEntropyLoss(reduction='none') + if ignore_index == -100: + self.loss = torch.nn.CrossEntropyLoss(reduction='none') + else: + self.loss = torch.nn.CrossEntropyLoss(reduction='none', ignore_index=ignore_index) def forward(self, input, target): return self.loss(input, target) - self.run_test(CrossEntropyLossNone(), input=(x, y)) + self.run_test(CrossEntropyLossNone(ignore_index), input=(x, y)) class CrossEntropyLossNoneWeight(torch.nn.Module): - def __init__(self): + def __init__(self, ignore_index): super(CrossEntropyLossNoneWeight, self).__init__() - self.loss = 
torch.nn.CrossEntropyLoss(reduction='none', weight=torch.randn(5)) + if ignore_index == -100: + self.loss = torch.nn.CrossEntropyLoss(reduction='none', weight=torch.randn(5)) + else: + self.loss = torch.nn.CrossEntropyLoss(reduction='none', weight=torch.randn(5), ignore_index=ignore_index) def forward(self, input, target): return self.loss(input, target) - self.run_test(CrossEntropyLossNoneWeight(), input=(x, y)) + self.run_test(CrossEntropyLossNoneWeight(ignore_index), input=(x, y)) class CrossEntropyLossSum(torch.nn.Module): - def __init__(self): + def __init__(self, ignore_index): super(CrossEntropyLossSum, self).__init__() - self.loss = torch.nn.CrossEntropyLoss(reduction='sum') + if ignore_index == -100: + self.loss = torch.nn.CrossEntropyLoss(reduction='sum') + else: + self.loss = torch.nn.CrossEntropyLoss(reduction='sum', ignore_index=ignore_index) def forward(self, input, target): return self.loss(input, target) - self.run_test(CrossEntropyLossSum(), input=(x, y)) + self.run_test(CrossEntropyLossSum(ignore_index), input=(x, y)) class CrossEntropyLossSumWeight(torch.nn.Module): - def __init__(self): + def __init__(self, ignore_index): super(CrossEntropyLossSumWeight, self).__init__() - self.loss = torch.nn.CrossEntropyLoss(reduction='sum', weight=torch.randn(5)) + if ignore_index == -100: + self.loss = torch.nn.CrossEntropyLoss(reduction='sum', weight=torch.randn(5)) + else: + self.loss = torch.nn.CrossEntropyLoss(reduction='sum', weight=torch.randn(5), ignore_index=ignore_index) def forward(self, input, target): return self.loss(input, target) - self.run_test(CrossEntropyLossSumWeight(), input=(x, y)) + self.run_test(CrossEntropyLossSumWeight(ignore_index), input=(x, y)) class CrossEntropyLossMean(torch.nn.Module): - def __init__(self): + def __init__(self, ignore_index): super(CrossEntropyLossMean, self).__init__() - self.loss = torch.nn.CrossEntropyLoss() + if ignore_index == -100: + self.loss = torch.nn.CrossEntropyLoss() + else: + self.loss = torch.nn.CrossEntropyLoss(ignore_index=ignore_index) def forward(self, input, target): return self.loss(input, target) - self.run_test(CrossEntropyLossMean(), input=(x, y)) + self.run_test(CrossEntropyLossMean(ignore_index), input=(x, y)) class CrossEntropyLossMeanWeight(torch.nn.Module): - def __init__(self): + def __init__(self, ignore_index): super(CrossEntropyLossMeanWeight, self).__init__() - self.loss = torch.nn.CrossEntropyLoss(weight=torch.randn(5)) - - def forward(self, input, target): - return self.loss(input, target) - - self.run_test(CrossEntropyLossMeanWeight(), input=(x, y)) - - class CrossEntropyLossNoneIgnoreIndex(torch.nn.Module): - def __init__(self): - super(CrossEntropyLossNoneIgnoreIndex, self).__init__() - self.loss = torch.nn.CrossEntropyLoss(reduction='none', ignore_index=1) - - def forward(self, input, target): - return self.loss(input, target) - - self.run_test(CrossEntropyLossNoneIgnoreIndex(), input=(x, y)) - - class CrossEntropyLossNoneWeightIgnoreIndex(torch.nn.Module): - def __init__(self): - super(CrossEntropyLossNoneWeightIgnoreIndex, self).__init__() - self.loss = torch.nn.CrossEntropyLoss(reduction='none', weight=torch.randn(5), ignore_index=1) - - def forward(self, input, target): - return self.loss(input, target) - - self.run_test(CrossEntropyLossNoneWeightIgnoreIndex(), input=(x, y)) - - class CrossEntropyLossSumIgnoreIndex(torch.nn.Module): - def __init__(self): - super(CrossEntropyLossSumIgnoreIndex, self).__init__() - self.loss = torch.nn.CrossEntropyLoss(reduction='sum', ignore_index=1) - - def 
forward(self, input, target): - return self.loss(input, target) - - self.run_test(CrossEntropyLossSumIgnoreIndex(), input=(x, y)) - - class CrossEntropyLossSumWeightIgnoreIndex(torch.nn.Module): - def __init__(self): - super(CrossEntropyLossSumWeightIgnoreIndex, self).__init__() - self.loss = torch.nn.CrossEntropyLoss(reduction='sum', weight=torch.randn(5), ignore_index=1) - - def forward(self, input, target): - return self.loss(input, target) - - self.run_test(CrossEntropyLossSumWeightIgnoreIndex(), input=(x, y)) - - class CrossEntropyLossMeanIgnoreIndex(torch.nn.Module): - def __init__(self): - super(CrossEntropyLossMeanIgnoreIndex, self).__init__() - self.loss = torch.nn.CrossEntropyLoss(ignore_index=1) + if ignore_index == -100: + self.loss = torch.nn.CrossEntropyLoss(weight=torch.randn(5)) + else: + self.loss = torch.nn.CrossEntropyLoss(weight=torch.randn(5), ignore_index=ignore_index) def forward(self, input, target): return self.loss(input, target) - self.run_test(CrossEntropyLossMeanIgnoreIndex(), input=(x, y)) - - class CrossEntropyLossMeanWeightIgnoreIndex(torch.nn.Module): - def __init__(self): - super(CrossEntropyLossMeanWeightIgnoreIndex, self).__init__() - self.loss = torch.nn.CrossEntropyLoss(weight=torch.randn(5), ignore_index=1) - - def forward(self, input, target): - return self.loss(input, target) + self.run_test(CrossEntropyLossMeanWeight(ignore_index), input=(x, y)) - self.run_test(CrossEntropyLossMeanWeightIgnoreIndex(), input=(x, y)) @skipIfUnsupportedMinOpsetVersion(9) + @disableScriptTest() # Output dtype mismatch def test_kldiv_loss(self): x = torch.randn(5) @@ -3933,7 +4039,7 @@ def forward(self, input, target): self.run_test(KLDivLossMiniBatchMean(), input=(x, y)) @skipIfUnsupportedMinOpsetVersion(12) - @disableScriptTest() + @disableScriptTest() # shape/type inference def test_nllloss(self): class NLLModel(torch.nn.Module): def __init__(self): @@ -3948,10 +4054,13 @@ def forward(self, input, target): N, C = 5, 4 input = torch.randn(N, 16) target = torch.empty(N, dtype=torch.long).random_(0, C) + + # using test data containing default ignore_index=-100 + target[target == 1] = -100 self.run_test(NLLModel(), (input, target)) @skipIfUnsupportedMinOpsetVersion(12) - @disableScriptTest() + @disableScriptTest() # shape/type inference def test_nllloss_2d_none(self): class NLLModel(torch.nn.Module): def __init__(self): @@ -3967,10 +4076,13 @@ def forward(self, input, target): N, C = 5, 4 input = torch.randn(N, 16, 10, 10) target = torch.empty(N, 8, 8, dtype=torch.long).random_(0, C) + + # using test data containing default ignore_index=-100 + target[target == 1] = -100 self.run_test(NLLModel(), (input, target)) @skipIfUnsupportedMinOpsetVersion(12) - @disableScriptTest() + @disableScriptTest() # shape/type inference def test_nllloss_2d_mean(self): class NLLModel(torch.nn.Module): def __init__(self): @@ -3986,10 +4098,13 @@ def forward(self, input, target): N, C = 5, 4 input = torch.randn(N, 16, 10, 10) target = torch.empty(N, 8, 8, dtype=torch.long).random_(0, C) + + # using test data containing default ignore_index=-100 + target[target == 1] = -100 self.run_test(NLLModel(), (input, target)) @skipIfUnsupportedMinOpsetVersion(12) - @disableScriptTest() + @disableScriptTest() # shape/type inference def test_nllloss_2d_sum(self): class NLLModel(torch.nn.Module): def __init__(self): @@ -4005,10 +4120,13 @@ def forward(self, input, target): N, C = 5, 4 input = torch.randn(N, 16, 10, 10) target = torch.empty(N, 8, 8, dtype=torch.long).random_(0, C) + + # using test data 
containing default ignore_index=-100 + target[target == 1] = -100 self.run_test(NLLModel(), (input, target)) @skipIfUnsupportedMinOpsetVersion(12) - @disableScriptTest() + @disableScriptTest() # shape/type inference def test_nllloss_2d_mean_weights(self): class NLLModel(torch.nn.Module): def __init__(self): @@ -4024,10 +4142,13 @@ def forward(self, input, target): N, C = 5, 4 input = torch.randn(N, 16, 10, 10) target = torch.empty(N, 8, 8, dtype=torch.long).random_(0, C) + + # using test data containing default ignore_index=-100 + target[target == 1] = -100 self.run_test(NLLModel(), (input, target)) @skipIfUnsupportedMinOpsetVersion(12) - @disableScriptTest() + @disableScriptTest() # shape/type inference def test_nllloss_2d_mean_ignore_index(self): class NLLModel(torch.nn.Module): def __init__(self): @@ -4046,7 +4167,7 @@ def forward(self, input, target): self.run_test(NLLModel(), (input, target)) @skipIfUnsupportedMinOpsetVersion(12) - @disableScriptTest() + @disableScriptTest() # shape/type inference def test_nllloss_2d_mean_ignore_index_weights(self): class NLLModel(torch.nn.Module): def __init__(self): @@ -4188,6 +4309,7 @@ def forward(self, cond, input, other): self.run_test(Model(), (x, y, z)) @skipIfUnsupportedMinOpsetVersion(9) + @disableScriptTest() # symbolic update needed for unbind: ONNX export of unbind with dynamic number of outputs def test_where_condition(self): class Model1(torch.nn.Module): def forward(self, input): @@ -4217,6 +4339,7 @@ def forward(self, input): else: pass return out + x = torch.randn(1, 2, 3, requires_grad=True) self.run_test(EmptyBranchModel(), x) @@ -4243,6 +4366,7 @@ def __init__(self): def forward(self, x): return 2 * x + x = torch.randn(1, 2, 3, requires_grad=True) f = io.BytesIO() torch.onnx._export(Model(), x, f) @@ -4251,13 +4375,15 @@ def forward(self, x): def check_proto(): torch._C._check_onnx_proto(model.SerializeToString()) + self.assertRaises(RuntimeError, check_proto) - @disableScriptTest() + @disableScriptTest() # dtype mismatch def test_split_tensor_scalar(self): class SplitModel(torch.nn.Module): def forward(self, x): return torch.split(x, x.size(1)) + x = torch.randn(1, 2, 3, requires_grad=True) self.run_test(SplitModel(), x) @@ -4265,10 +4391,12 @@ def test_split_tensor_multi(self): class SplitModel(torch.nn.Module): def forward(self, x): return torch.split(x, torch.ones(3)) + x = torch.randn(1, 2, 3, requires_grad=True) def run_model(): SplitModel(x) + self.assertRaises(TypeError, run_model) def _dispatch_rnn_test(self, name, *args, **kwargs): @@ -4422,7 +4550,8 @@ def forward(self, x): model.train() - ort_sess = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, training=torch.onnx.TrainingMode.TRAINING) + ort_sess = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, + training=torch.onnx.TrainingMode.TRAINING) ort_outs = run_ort(ort_sess, input=(x,)) assert not torch.all(torch.eq(x, torch.from_numpy(ort_outs[0]))) @@ -4448,7 +4577,8 @@ def forward(self, x): model.train() - ort_sess = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, training=torch.onnx.TrainingMode.TRAINING) + ort_sess = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, + training=torch.onnx.TrainingMode.TRAINING) ort_outs = run_ort(ort_sess, input=(x,)) y = model(input) @@ -4476,11 +4606,14 @@ def forward(self, x): model = MyModule() x = torch.randn(10, 3, 128, 128) - ort_sess1 = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, training=torch.onnx.TrainingMode.TRAINING) 
+ ort_sess1 = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, + training=torch.onnx.TrainingMode.TRAINING) ort_outs1 = run_ort(ort_sess1, input=(x,)) - ort_sess2 = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, training=torch.onnx.TrainingMode.EVAL) + ort_sess2 = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, + training=torch.onnx.TrainingMode.EVAL) ort_outs2 = run_ort(ort_sess2, input=(x,)) - [np.testing.assert_allclose(ort_out1, ort_out2, atol=1e-7, rtol=0.001) for ort_out1, ort_out2 in zip(ort_outs1, ort_outs2)] + [np.testing.assert_allclose(ort_out1, ort_out2, atol=1e-7, rtol=0.001) for ort_out1, ort_out2 in + zip(ort_outs1, ort_outs2)] def test_multiple_conv_bn(self): class MyModule(torch.nn.Module): @@ -4494,7 +4627,6 @@ def __init__(self): self.relu = torch.nn.ReLU(inplace=True) self.maxpool = torch.nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - def forward(self, x): x = self.conv1(x) x = self.bn(x) @@ -4510,11 +4642,14 @@ def forward(self, x): model = MyModule() x = torch.randn(2, 3, 224, 224) - ort_sess1 = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, training=torch.onnx.TrainingMode.TRAINING) + ort_sess1 = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, + training=torch.onnx.TrainingMode.TRAINING) ort_outs1 = run_ort(ort_sess1, input=(x,)) - ort_sess2 = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, training=torch.onnx.TrainingMode.EVAL) + ort_sess2 = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, + training=torch.onnx.TrainingMode.EVAL) ort_outs2 = run_ort(ort_sess2, input=(x,)) - [np.testing.assert_allclose(ort_out1, ort_out2, atol=1e-7, rtol=0.001) for ort_out1, ort_out2 in zip(ort_outs1, ort_outs2)] + [np.testing.assert_allclose(ort_out1, ort_out2, atol=1e-7, rtol=0.001) for ort_out1, ort_out2 in + zip(ort_outs1, ort_outs2)] def make_test(name, base, layer, bidirectional, initial_state, variable_length, dropout, @@ -4527,7 +4662,7 @@ def make_test(name, base, layer, bidirectional, initial_state, # Cannot export with older opsets because of 'ConstantFill' op # ConstantFill was a temp op removed at opset 8. 
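
The train/eval comparisons above go through the `convert_to_onnx` and `run_ort` test utilities. A helper-free sketch of the exporter call they wrap, assuming the `training` keyword of `torch.onnx.export` (as forwarded by `convert_to_onnx` here):

```python
import io
import torch

# Sketch only: export the same module once per training mode; the tests then
# run both graphs through ONNX Runtime and compare outputs.
model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.BatchNorm2d(8))
x = torch.randn(2, 3, 16, 16)

for mode in (torch.onnx.TrainingMode.TRAINING, torch.onnx.TrainingMode.EVAL):
    f = io.BytesIO()
    # Constant folding is kept off for the training-mode export.
    torch.onnx.export(model, (x,), f, opset_version=12, training=mode,
                      do_constant_folding=(mode == torch.onnx.TrainingMode.EVAL))
```
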
This is no longer supported by onnxruntime - @disableScriptTest() + @disableScriptTest() # Test code not scriptable @skipIfUnsupportedMinOpsetVersion(9) def f(self): self._dispatch_rnn_test( @@ -4542,7 +4677,6 @@ def f(self): f.__name__ = test_name setattr(TestONNXRuntime, f.__name__, f) - def setup_rnn_tests(): layers_opts = [ (1, 'unilayer'), @@ -4567,13 +4701,12 @@ def setup_rnn_tests(): ] test_count = 0 for (layer, bidirectional, initial_state, variable_length, dropout) in \ - itertools.product( - layers_opts, - bidirectional_opts, - initial_state_opts, - variable_length_opts, - dropout_opts, - ): + itertools.product( + layers_opts, + bidirectional_opts, + initial_state_opts, + variable_length_opts, + dropout_opts,): for base, name, extra_kwargs in ( ('elman', 'elman_relu', {'nonlinearity': u'relu'}), @@ -4594,7 +4727,6 @@ def setup_rnn_tests(): if test_count != 192: raise ValueError('Expected 192 tests but found {}'.format(test_count)) - setup_rnn_tests() @@ -4654,17 +4786,25 @@ def setup_rnn_tests(): dict(TestONNXRuntime.__dict__, opset_version=12, keep_initializers_as_inputs=False)) -# opset 9 tests, with use_new_jit_passes=True for using new jit API -TestONNXRuntime_opset9_new_jit_API = type(str("TestONNXRuntime_opset9_new_jit_API"), - (unittest.TestCase,), - dict(TestONNXRuntime.__dict__, - use_new_jit_passes=True)) - -# opset 12 tests, with use_new_jit_passes=True for using new jit API -TestONNXRuntime_opset12_new_jit_API = type(str("TestONNXRuntime_opset12_new_jit_API"), - (unittest.TestCase,), - dict(TestONNXRuntime.__dict__, opset_version=12, - use_new_jit_passes=True)) + +# opset 9 tests, with use_new_jit_passes=True for using new jit API, +# and with keep_initializers_as_inputs=False for IR version 4 style export. +TestONNXRuntime_opset9_IRv4_new_jit_API = type(str("TestONNXRuntime_opset9_IRv4_new_jit_API"), + (unittest.TestCase,), + dict(TestONNXRuntime.__dict__, + keep_initializers_as_inputs=False, + use_new_jit_passes=True, + onnx_shape_inference=True)) + + +# opset 12 tests, with use_new_jit_passes=True for using new jit API, +# and keep_initializers_as_inputs=False for IR version 4 style export. +TestONNXRuntime_opset12_IRv4_new_jit_API = type(str("TestONNXRuntime_opset12_IRv4_new_jit_API"), + (unittest.TestCase,), + dict(TestONNXRuntime.__dict__, opset_version=12, + keep_initializers_as_inputs=False, + use_new_jit_passes=True, + onnx_shape_inference=True)) # opset 12 tests, with _onnx_shape_inference=True. diff --git a/test/onnx/test_pytorch_onnx_shape_inference.py b/test/onnx/test_pytorch_onnx_shape_inference.py new file mode 100644 index 000000000000..b0b56d9296c7 --- /dev/null +++ b/test/onnx/test_pytorch_onnx_shape_inference.py @@ -0,0 +1,78 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import unittest +import torch + +import copy + +import test_pytorch_onnx_onnxruntime +from test_pytorch_onnx_onnxruntime import TestONNXRuntime +from torch.onnx import utils, OperatorExportTypes, TrainingMode +from torch.onnx.utils import _validate_dynamic_axes +from torch.onnx.symbolic_helper import (_set_opset_version, _set_operator_export_type, + _set_onnx_shape_inference, _set_training_mode, + _is_tensor_list, _is_tensor, _is_none) + + +def verify_inferred_shape(graph): + # Check every node in graph has type properly assigned. 
+ for n in graph.nodes(): + for out in n.outputs(): + if not _is_tensor_list(out) and not _is_tensor(out) and not _is_none(out): + raise RuntimeError("Output of node is neither type Tensor nor type list of Tensor: ", out) + if _is_tensor(out) and out.type().scalarType() is None: + raise RuntimeError("Output of node does not have type assigned", out) + if _is_tensor(out) and out.type().dim() is None: + raise RuntimeError("Output of node does not have shape assigned", out) + + +def run_model_test(self, model, batch_size=2, state_dict=None, + input=None, use_gpu=True, rtol=0.001, atol=1e-7, + example_outputs=None, do_constant_folding=True, + dynamic_axes=None, test_with_inputs=None, + input_names=None, output_names=None, + fixed_batch_size=False): + model.eval() + + if input is None: + input = torch.randn(batch_size, 3, 224, 224, requires_grad=True) + + with torch.no_grad(): + if isinstance(input, torch.Tensor): + input = (input,) + # In-place operators will update input tensor data as well. + # Thus inputs are replicated before every forward call. + input_copy = copy.deepcopy(input) + output = model(*input_copy) + if isinstance(output, torch.Tensor): + output = (output,) + + _set_opset_version(self.opset_version) + _set_operator_export_type(OperatorExportTypes.ONNX) + _set_onnx_shape_inference(True) + _set_training_mode(False) + if dynamic_axes is None: + dynamic_axes = {} + _validate_dynamic_axes(dynamic_axes, model, input_names, output_names) + + input_copy = copy.deepcopy(input) + graph, _, _ = utils._model_to_graph(model, input_copy, + input_names=input_names, + output_names=output_names, + operator_export_type=OperatorExportTypes.ONNX, + example_outputs=output, + do_constant_folding=do_constant_folding, + training=TrainingMode.EVAL, + use_new_jit_passes=self.use_new_jit_passes, + dynamic_axes=dynamic_axes) + verify_inferred_shape(graph) + + +if __name__ == '__main__': + TestONNXRuntime.opset_version = 12 + test_pytorch_onnx_onnxruntime.run_model_test = run_model_test + + unittest.main() diff --git a/test/package_a/__init__.py b/test/package_a/__init__.py new file mode 100644 index 000000000000..4761b3db5e41 --- /dev/null +++ b/test/package_a/__init__.py @@ -0,0 +1,7 @@ +result = 'package_a' + +class PackageAObject: + __slots__ = ['obj'] + + def __init__(self, obj): + self.obj = obj diff --git a/test/package_a/subpackage.py b/test/package_a/subpackage.py new file mode 100644 index 000000000000..46f729d51852 --- /dev/null +++ b/test/package_a/subpackage.py @@ -0,0 +1,3 @@ +result = 'package_a.subpackage' +class PackageASubpackageObject: + pass diff --git a/test/print_test_stats.py b/test/print_test_stats.py index 522e6652efe1..339f6800f61b 100755 --- a/test/print_test_stats.py +++ b/test/print_test_stats.py @@ -84,6 +84,7 @@ def build_message(test_case): "build_tag": os.environ.get("CIRCLE_TAG"), "build_sha1": os.environ.get("CIRCLE_SHA1"), "build_branch": os.environ.get("CIRCLE_BRANCH"), + "build_job": os.environ.get("CIRCLE_JOB"), "test_suite_name": test_case.class_name, "test_case_name": test_case.name, }, diff --git a/test/quantization/test_quantize.py b/test/quantization/test_quantize.py index 91594da111c1..e54eb33770c2 100644 --- a/test/quantization/test_quantize.py +++ b/test/quantization/test_quantize.py @@ -14,6 +14,8 @@ fuse_modules, quantize_dynamic, QuantWrapper, + QuantStub, + DeQuantStub, QConfig, default_qconfig, default_qat_qconfig, @@ -21,6 +23,8 @@ per_channel_dynamic_qconfig, float16_dynamic_qconfig, float_qparams_dynamic_qconfig, + 
register_observed_custom_module_mapping, + register_quantized_custom_module_mapping, ) from torch.testing._internal.common_quantization import ( @@ -571,6 +575,115 @@ def forward(self, indices, offsets, per_sample_weights, linear_in): self.checkLinear(model.fc) self.checkDynamicQuantizedModule(quantized_model.emb, torch.nn.quantized.EmbeddingBag, torch.quint8) + @skipIfNoFBGEMM + def test_custom_module_class(self): + class CustomModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(1, 1, 1) + + def forward(self, x): + return self.conv(x) + + class ObservedCustomModule(torch.nn.Module): + def __init__(self, conv): + super().__init__() + self.conv = conv + + def forward(self, x): + return self.conv(x) + + @classmethod + def from_float(cls, float_module): + assert hasattr(float_module, 'qconfig') + observed = cls(float_module.conv) + observed.qconfig = float_module.qconfig + return observed + + class QuantizedCustomModule(torch.nn.Module): + def __init__(self, conv): + super().__init__() + self.conv = conv + + def forward(self, x): + return self.conv(x) + + @classmethod + def from_observed(cls, observed_module): + assert hasattr(observed_module, 'qconfig') + assert hasattr(observed_module, 'activation_post_process') + observed_module.conv.activation_post_process = \ + observed_module.activation_post_process + quantized = cls(nnq.Conv2d.from_float(observed_module.conv)) + return quantized + + register_observed_custom_module_mapping(CustomModule, ObservedCustomModule) + register_quantized_custom_module_mapping(CustomModule, QuantizedCustomModule) + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = QuantStub() + self.conv = torch.nn.Conv2d(1, 1, 1) + self.custom = CustomModule() + self.dequant = DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.conv(x) + x = self.custom(x) + x = self.dequant(x) + return x + + class RefM(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = QuantStub() + self.conv1 = torch.nn.Conv2d(1, 1, 1) + self.conv2 = torch.nn.Conv2d(1, 1, 1) + self.dequant = DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.conv1(x) + x = self.conv2(x) + x = self.dequant(x) + return x + + data = torch.randn(1, 1, 1, 1) + # instantiate M and RefM and align the parameters + original_m = M() + original_ref_m = RefM() + original_ref_m.conv1.weight = torch.nn.Parameter(original_m.conv.weight.detach()) + original_ref_m.conv1.bias = torch.nn.Parameter(original_m.conv.bias.detach()) + original_ref_m.conv2.weight = torch.nn.Parameter(original_m.custom.conv.weight.detach()) + original_ref_m.conv2.bias = torch.nn.Parameter(original_m.custom.conv.bias.detach()) + + original_m.qconfig = default_qconfig + m = prepare(original_m) + self.checkObservers(m) + # calibration + m(data) + # all activation observers are inserted in the top level module + + # check converted/quantized model + m = convert(m) + # check if the module is properly quantized + self.assertEqual(type(m.quant), nnq.Quantize) + self.assertEqual(type(m.conv), nnq.Conv2d) + self.assertEqual(type(m.custom.conv), nnq.Conv2d) + self.assertEqual(type(m.dequant), nnq.DeQuantize) + res = m(data) + + # quantize the reference model + original_ref_m.eval() + original_ref_m.qconfig = default_qconfig + ref_m = prepare(original_ref_m) + ref_m(data) + ref_m = convert(ref_m) + ref_res = ref_m(data) + self.assertEqual(res, ref_res) + @skipIfNoFBGEMM class TestPostTrainingDynamic(QuantizationTestCase): diff --git 
a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index 3170bfbfe8b4..53551efb7c0f 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -9,17 +9,16 @@ # symbolic trace from torch.fx import symbolic_trace +from torch.fx.symbolic_trace import Tracer + # graph mode quantization based on fx from torch.quantization import ( QuantType, - fuse_fx, prepare_fx, convert_fx, - prepare_static_fx, - convert_static_fx, - quantize_static_fx, - quantize_dynamic_fx, prepare_qat_fx, + register_observed_custom_module_mapping, + register_quantized_custom_module_mapping, ) from torch.quantization import ( @@ -40,6 +39,7 @@ skip_if_no_torchvision, train_one_epoch, run_ddp, + LinearModelWithSubmodule, ) from torch.testing._internal.common_quantized import ( @@ -58,7 +58,9 @@ import itertools import operator import unittest +import io +@skipIfNoFBGEMM class TestQuantizeFx(QuantizationTestCase): def _get_conv_linear_test_cases(self): ''' Returns a list of test cases, with format: @@ -151,11 +153,11 @@ def test_functional_debug(self): quant_type = QuantType.DYNAMIC if is_dynamic else QuantType.STATIC node_occurrence = dict() if weight_prepack_node: - node_occurrence[weight_prepack_node] = 1 + node_occurrence[weight_prepack_node] = 0 + node_occurrence[quantized_node] = 0 self.checkGraphModeFxOp( ModuleClass(*module_constructor_inputs), inputs, quant_type, - expected_node=quantized_node, expected_node_occurrence=node_occurrence, debug=True) @@ -176,7 +178,8 @@ def forward(self, x): original = symbolic_trace(m) qconfig = default_dynamic_qconfig qconfig_dict = {'': qconfig} - quantized = quantize_dynamic_fx(original, qconfig_dict, debug=True) + prepared = prepare_fx(original, qconfig_dict) + quantized = convert_fx(prepared, debug=True) qparams = (quantized._scale_0, quantized._zero_point_0) weight_obs = qconfig.weight() weight_obs(quantized.weight) @@ -219,14 +222,12 @@ def forward(self, x): for debug in [True, False]: node_occurrence = dict() if weight_prepack_node: - if debug: - node_occurrence[weight_prepack_node] = 1 - else: - node_occurrence[weight_prepack_node] = 0 + node_occurrence[weight_prepack_node] = 0 m = ModuleClass(*module_constructor_inputs).eval() m = symbolic_trace(m) qconfig_dict = {"": float16_dynamic_qconfig} - m = quantize_dynamic_fx(m, qconfig_dict, debug=debug) + m = prepare_fx(m, qconfig_dict) + m = convert_fx(m, debug=debug) self.checkGraphModuleNodes(m, expected_node_occurrence=node_occurrence) @@ -262,8 +263,7 @@ def forward(self, x): model = symbolic_trace(model) # QAT prepare - model = fuse_fx(model) - model = prepare_fx(model, qconfig_dict) + model = prepare_qat_fx(model, qconfig_dict) # ensure that running an input on CUDA works without any needed changes input = torch.randn(4, 1, 4, 4, device=device) @@ -286,13 +286,19 @@ def __init__(self): def forward(self, x): return self.conv(x) - model = symbolic_trace(M().eval()) + model = M().eval() + model = symbolic_trace(model) qconfig_dict = {'': default_qconfig} - non_inplace_model = quantize_static_fx( - model, qconfig_dict, test_only_eval_fn, [self.img_data_2d], inplace=False) - inplace_model = model - inplace_model = quantize_static_fx( - inplace_model, qconfig_dict, test_only_eval_fn, [self.img_data_2d], inplace=True) + prepared = prepare_fx( + model, qconfig_dict, inplace=False) + test_only_eval_fn(model, self.img_data_2d) + non_inplace_model = convert_fx(prepared, inplace=True) + + prepared = prepare_fx( + model, qconfig_dict, inplace=True) + 
test_only_eval_fn(model, self.img_data_2d) + inplace_model = convert_fx(prepared, inplace=True) + non_inplace_res = non_inplace_model(self.img_data_2d[0][0]) inplace_res = inplace_model(self.img_data_2d[0][0]) self.assertEqual(non_inplace_res, inplace_res) @@ -312,11 +318,101 @@ def forward(self, x): dict_input = {"input": torch.randn(1, 1, 1, 1)} m = symbolic_trace(M()).eval() qconfig_dict = {"": default_qconfig} - m = prepare_static_fx(m, qconfig_dict) + m = prepare_fx(m, qconfig_dict) m(dict_input) - m = convert_static_fx(m) + m = convert_fx(m) m(dict_input) + def test_standalone_module_class(self): + class StandaloneModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(1, 1, 1) + + def forward(self, x): + return self.conv(x) + + class CustomTracer(Tracer): + def is_leaf_module(self, m, module_qualified_name): + return (m.__module__.startswith('torch.nn') and + not isinstance(m, torch.nn.Sequential)) or \ + isinstance(m, StandaloneModule) + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(1, 1, 1) + self.standalone = StandaloneModule() + + def forward(self, x): + x = self.conv(x) + x = self.standalone(x) + return x + + class RefM(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv1 = torch.nn.Conv2d(1, 1, 1) + self.conv2 = torch.nn.Conv2d(1, 1, 1) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + return x + + data = torch.randn(1, 1, 1, 1) + # instantiate M and RefM and align the parameters + original_m = M() + original_ref_m = RefM() + original_ref_m.conv1.weight = torch.nn.Parameter(original_m.conv.weight.detach()) + original_ref_m.conv1.bias = torch.nn.Parameter(original_m.conv.bias.detach()) + original_ref_m.conv2.weight = torch.nn.Parameter(original_m.standalone.conv.weight.detach()) + original_ref_m.conv2.bias = torch.nn.Parameter(original_m.standalone.conv.bias.detach()) + + m = torch.fx.GraphModule(original_m, CustomTracer().trace(original_m)).eval() + qconfig_dict = {'': default_qconfig, 'standalone_module_name': ['standalone']} + # check prepared model + m = prepare_fx(m, qconfig_dict) + # calibration + m(data) + # input and output of first conv, observer for standalone module + # will be inserted in the standalone module itself + count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 2 + } + self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) + # for output of conv in the standalone module + count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 1 + } + self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=count_check) + + # check converted/quantized model + m = convert_fx(m) + count_check = { + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_module(nnq.Conv2d) : 1, + ns.call_method('dequantize') : 1, + } + self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) + count_check = { + # quantization of input happens in parent module + # quantization of output happens in the quantized conv module + ns.call_function(torch.quantize_per_tensor) : 0, + # dequantization for output happens in parent module + ns.call_method('dequantize') : 0, + } + self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=count_check) + res = m(data) + + # quantize the reference model + ref_m = symbolic_trace(original_ref_m).eval() + ref_m = prepare_fx(ref_m, qconfig_dict) + ref_m(data) + ref_m = convert_fx(ref_m) + ref_res = ref_m(data) + self.assertEqual(res, ref_res) + 
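
The hunks above migrate these tests from the removed one-shot quantize_static_fx/quantize_dynamic_fx helpers to an explicit prepare_fx -> calibrate -> convert_fx flow driven by a qconfig_dict. A minimal sketch of that flow, assuming the FBGEMM backend is available (mirroring the @skipIfNoFBGEMM guards); the toy module and calibration input are illustrative stand-ins, not taken from this PR:

import torch
import torch.nn as nn
from torch.fx import symbolic_trace
from torch.quantization import default_qconfig, prepare_fx, convert_fx

class SmallConv(nn.Module):
    # illustrative float module, not from the PR
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(1, 1, 1)

    def forward(self, x):
        return self.conv(x)

# symbolically trace, insert observers, calibrate, then convert
m = symbolic_trace(SmallConv().eval())
qconfig_dict = {"": default_qconfig}   # global qconfig, as used in the tests above
m = prepare_fx(m, qconfig_dict)        # inserts observers
m(torch.randn(1, 1, 4, 4))             # calibration run
m = convert_fx(m)                      # swaps in quantized modules (e.g. nnq.Conv2d)
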
@skipIfNoFBGEMM def test_qconfig_none(self): class M(torch.nn.Module): @@ -332,21 +428,142 @@ def forward(self, x): m = M().eval() m = symbolic_trace(m) - qconfig_dict = {'': default_qconfig, 'conv2': None} - m = prepare_static_fx(m, qconfig_dict) + qconfig_dict = {"": default_qconfig, + "module_name": [("conv2", None)]} + m = prepare_fx(m, qconfig_dict) data = torch.randn(1, 1, 1, 1) m(data) - m = convert_static_fx(m) + m = convert_fx(m) m(data) # first conv is quantized, second conv is not quantized node_list = [ ns.call_function(torch.quantize_per_tensor), ns.call_module(nnq.Conv2d), - ns.call_method('dequantize'), + ns.call_method("dequantize"), ns.call_module(nn.Conv2d), ] self.checkGraphModuleNodes(m, expected_node_list=node_list) + def test_qconfig_module_type(self): + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + self.conv1 = nn.Conv2d(1, 1, 1) + self.conv2 = nn.Conv2d(1, 1, 1) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + return x + + m = M().eval() + m = symbolic_trace(m) + qconfig_dict = {"object_type": [(torch.nn.Conv2d, default_qconfig)]} + m = prepare_fx(m, qconfig_dict) + data = torch.randn(1, 1, 1, 1) + m(data) + m = convert_fx(m) + m(data) + # first conv is quantized, second conv is not quantized + node_list = [ + ns.call_function(torch.quantize_per_tensor), + ns.call_module(nnq.Conv2d), + ns.call_module(nnq.Conv2d), + ns.call_method("dequantize"), + ] + self.checkGraphModuleNodes(m, expected_node_list=node_list) + + def test_qconfig_function(self): + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + + def forward(self, x, y): + return x + y + + m = M().eval() + m = symbolic_trace(m) + qconfig_dict = {"object_type": [(operator.add, default_qconfig)]} + m = prepare_fx(m, qconfig_dict) + data = torch.randn(1, 1, 1, 1) + m(data, data) + m = convert_fx(m) + m(data, data) + # first conv is quantized, second conv is not quantized + node_list = [ + ns.call_function(torch.quantize_per_tensor), + ns.call_function(torch.ops.quantized.add), + ns.call_method("dequantize"), + ] + self.checkGraphModuleNodes(m, expected_node_list=node_list) + + def test_qconfig_module_name_regex(self): + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + self.conv1 = nn.Conv2d(1, 1, 1) + self.conv2 = nn.Conv2d(1, 1, 1) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + return x + + m = M().eval() + m = symbolic_trace(m) + qconfig_dict = {"module_name_regex": [("conv*", default_qconfig)]} + m = prepare_fx(m, qconfig_dict) + data = torch.randn(1, 1, 1, 1) + m(data) + m = convert_fx(m) + m(data) + # first conv is quantized, second conv is not quantized + node_list = [ + ns.call_function(torch.quantize_per_tensor), + ns.call_module(nnq.Conv2d), + ns.call_module(nnq.Conv2d), + ns.call_method("dequantize"), + ] + self.checkGraphModuleNodes(m, expected_node_list=node_list) + + def test_qconfig_precedence(self): + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + self.linear = nn.Linear(1, 1) + self.conv = nn.Conv2d(1, 1, 1) + self.module_conv1 = nn.Conv2d(1, 1, 1) + self.module_conv2 = nn.Conv2d(1, 1, 1) + + def forward(self, x): + # global + x = self.linear(x) + # global + object_type --> object_type + x = self.conv(x) + # global + object_type + module_name_regex --> module_name_regex + x = self.module_conv1(x) + # global + object_type + module_name_regex + module_name --> module_name + x = self.module_conv2(x) + return x + + m = M().eval() + m = 
symbolic_trace(m) + global_qconfig = default_qconfig + object_type_qconfig = default_dynamic_qconfig + module_name_regex_qconfig = float16_dynamic_qconfig + module_name_qconfig = default_qat_qconfig + qconfig_dict = { + "": global_qconfig, + "object_type": [(nn.Conv2d, object_type_qconfig)], + "module_name_regex": [("module_conv*", module_name_regex_qconfig)], + "module_name": [("module_conv2", module_name_qconfig)]} + m = prepare_fx(m, qconfig_dict) + self.assertEqual(m.linear.qconfig, global_qconfig) + self.assertEqual(m.conv.qconfig, object_type_qconfig) + self.assertEqual(m.module_conv1.qconfig, module_name_regex_qconfig) + self.assertEqual(m.module_conv2.qconfig, module_name_qconfig) + + def test_remove_qconfig(self): class M(torch.nn.Module): def __init__(self): @@ -359,10 +576,10 @@ def forward(self, x): m = M().eval() m = symbolic_trace(m) qconfig_dict = {'': default_qconfig} - m = prepare_static_fx(m, qconfig_dict) + m = prepare_fx(m, qconfig_dict) data = torch.randn(1, 1, 1, 1) m(data) - m = convert_static_fx(m) + m = convert_fx(m) m(data) for name, module in m.named_modules(): self.assertFalse(hasattr(module, 'qconfig'), @@ -370,28 +587,8 @@ def forward(self, x): @skipIfNoFBGEMM def test_qat_and_script(self): - class TwoLayerLinear(nn.Module): - def __init__(self): - super(TwoLayerLinear, self).__init__() - self.fc1 = nn.Linear(5, 5) - self.fc2 = nn.Linear(5, 5) - - def forward(self, x): - x = self.fc1(x) - return self.fc2(x) - class Model(nn.Module): - def __init__(self): - super(Model, self).__init__() - self.subm = TwoLayerLinear() - self.fc = nn.Linear(5, 5) - - def forward(self, x): - x = self.subm(x) - x = self.fc(x) - return x - - model = Model() + model = LinearModelWithSubmodule() qengine = torch.backends.quantized.engine qconfig_dict = {'': torch.quantization.get_default_qat_qconfig(qengine)} @@ -429,59 +626,172 @@ def forward(self, x): @skipIfNoFBGEMM def test_save_observer_state_dict(self): - class TwoLayerLinear(nn.Module): - def __init__(self): - super(TwoLayerLinear, self).__init__() - self.fc1 = nn.Linear(5, 5) - self.fc2 = nn.Linear(5, 5) - - def forward(self, x): - x = self.fc1(x) - return self.fc2(x) - - class Model(nn.Module): - def __init__(self): - super(Model, self).__init__() - self.subm = TwoLayerLinear() - self.fc = nn.Linear(5, 5) - - def forward(self, x): - x = self.subm(x) - x = self.fc(x) - return x - - model = Model().eval() + orig = LinearModelWithSubmodule().eval() + model = orig qconfig_dict = {'': torch.quantization.get_default_qconfig('fbgemm')} - # symbolically trace model = symbolic_trace(model) - model = prepare_static_fx(model, qconfig_dict) + model = prepare_fx(model, qconfig_dict) # run it through input x = torch.randn(5, 5) model(x) - quant = convert_static_fx(model) + quant = convert_fx(model) # save state_dict of model - import io + obs_dict = torch.quantization.get_observer_state_dict(model) b = io.BytesIO() - torch.save(model.state_dict(), b) + torch.save(obs_dict, b) b.seek(0) # Load the stats into new model - model_2 = Model().eval() + model_2 = orig model_2 = symbolic_trace(model_2) - model_2 = prepare_static_fx(model_2, qconfig_dict) + model_2 = prepare_fx(model_2, qconfig_dict) loaded_dict = torch.load(b) - model_2.load_state_dict(loaded_dict) + torch.quantization.load_observer_state_dict(model_2, loaded_dict) - quant_2 = convert_static_fx(model_2) + quant_2 = convert_fx(model_2) # Verify that loaded state dict produces same results. 
self.assertEqual(quant(x), quant_2(x)) + @skipIfNoFBGEMM + def test_custom_module_class(self): + class CustomModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(1, 1, 1) + + def forward(self, x): + return self.conv(x) + + class ObservedCustomModule(torch.nn.Module): + def __init__(self, conv): + super().__init__() + self.conv = conv + + def forward(self, x): + return self.conv(x) + + @classmethod + def from_float(cls, float_module): + assert hasattr(float_module, 'qconfig') + observed = cls(float_module.conv) + observed.qconfig = float_module.qconfig + return observed + + class QuantizedCustomModule(torch.nn.Module): + def __init__(self, conv): + super().__init__() + self.conv = conv + + def forward(self, x): + return self.conv(x) + + @classmethod + def from_observed(cls, observed_module): + assert hasattr(observed_module, 'qconfig') + assert hasattr(observed_module, 'activation_post_process') + observed_module.conv.activation_post_process = \ + observed_module.activation_post_process + quantized = cls(nnq.Conv2d.from_float(observed_module.conv)) + return quantized + + class DynamicallyQuantizedCustomModule(torch.nn.Module): + def __init__(self, conv): + super().__init__() + self.conv = conv + + def forward(self, x): + return self.conv(x) + + @classmethod + def from_observed(cls, observed_module): + assert hasattr(observed_module, 'qconfig') + assert hasattr(observed_module, 'activation_post_process') + quantized = cls(nnqd.Conv2d.from_float(observed_module.conv)) + return quantized + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(1, 1, 1) + self.custom = CustomModule() + + def forward(self, x): + x = self.conv(x) + x = self.custom(x) + return x + + class RefM(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv1 = torch.nn.Conv2d(1, 1, 1) + self.conv2 = torch.nn.Conv2d(1, 1, 1) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + return x + + data = torch.randn(1, 1, 1, 1) + # instantiate M and RefM and align the parameters + original_m = M() + original_ref_m = RefM() + original_ref_m.conv1.weight = torch.nn.Parameter(original_m.conv.weight.detach()) + original_ref_m.conv1.bias = torch.nn.Parameter(original_m.conv.bias.detach()) + original_ref_m.conv2.weight = torch.nn.Parameter(original_m.custom.conv.weight.detach()) + original_ref_m.conv2.bias = torch.nn.Parameter(original_m.custom.conv.bias.detach()) + + from torch.fx.symbolic_trace import Tracer + + # define a custom tracer to not trace through the custom module + + class CustomTracer(Tracer): + def is_leaf_module(self, m, module_qualified_name): + return (m.__module__.startswith('torch.nn') and + not isinstance(m, torch.nn.Sequential)) or \ + isinstance(m, CustomModule) + + # TODO: add other quant types after mixed mode support + for quant_type in [QuantType.STATIC]: + # register observed and quantized custom module classes + register_observed_custom_module_mapping(CustomModule, ObservedCustomModule) + register_quantized_custom_module_mapping(CustomModule, QuantizedCustomModule) + + m = torch.fx.GraphModule(original_m, CustomTracer().trace(original_m)).eval() + qconfig_dict = {'': default_qconfig} + # check prepared model + m = prepare_fx(m, qconfig_dict) + # calibration + m(data) + # all activation observers are inserted in the top level module + count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 3 + } + self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) + + # check 
converted/quantized model + m = convert_fx(m) + count_check = { + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_module(nnq.Conv2d) : 1, + ns.call_method('dequantize') : 1, + } + self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) + res = m(data) + + # quantize the reference model + ref_m = symbolic_trace(original_ref_m).eval() + ref_m = prepare_fx(ref_m, qconfig_dict) + ref_m(data) + ref_m = convert_fx(ref_m) + ref_res = ref_m(data) + self.assertEqual(res, ref_res) + class TestQuantizeFxOps(QuantizationTestCase): """Unit tests for individual ops """ @@ -1037,7 +1347,7 @@ def forward(self, x): data = torch.rand(1, 3, 10, 10) # This model is not executable since we just put all ops # in the same forward - m = M() + m = M().eval() original = symbolic_trace(m) # nothing to fuse so skipping the fuse step qconfig_dict = {'': default_qconfig} @@ -1132,7 +1442,7 @@ def forward(self, x): # This model is not executable since we just put all ops # in the same forward - m = M() + m = M().eval() original = symbolic_trace(m) # nothing to fuse so skipping the fuse step qconfig_dict = {'': default_qconfig} @@ -1201,7 +1511,6 @@ def _test_model_impl( if mode != 'static': model.train() - graph_module = fuse_fx(graph_module) prepared = prepare_fx(graph_module, qconfig_dict) if mode == 'ddp': diff --git a/test/quantization/test_quantize_jit.py b/test/quantization/test_quantize_jit.py index 6d94919eee1f..a0fad9b80e89 100644 --- a/test/quantization/test_quantize_jit.py +++ b/test/quantization/test_quantize_jit.py @@ -51,6 +51,7 @@ SkipQuantModel, NestedModel, ConvModel, + ConvTransposeModel, default_per_channel_qconfig, test_only_eval_fn, ConvBnModel, @@ -61,6 +62,7 @@ AnnotatedSkipQuantModel, AnnotatedNestedModel, AnnotatedConvModel, + AnnotatedConvTransposeModel, AnnotatedConvBnModel, ) @@ -74,6 +76,7 @@ # Standard library import itertools import unittest +import io class TestQuantizeJitPasses(QuantizationTestCase): """ Test graph mode quantization passes used by quantize_jit @@ -1361,6 +1364,52 @@ def forward(self, x, y): FileCheck().check("quantized::embedding_bag_byte_rowwise_offsets") \ .run(m.graph) + @skipIfNoFBGEMM + def test_quantize_fork_wait(self): + """ Tests the case where fork and wait calls are in different subgraphs + Calling inline fork-wait only removes the fork call and leaves aten::wait + calls in the graph, with Tensor as input (instead of Future[Tensor]) + """ + class MainModule(nn.Module): + def __init__(self): + super(MainModule, self).__init__() + self.fork_ops = ForkModule() + + def init_values(self, x): + shared_module = self.fork_ops(x) + self.fork_dict = shared_module + + def forward(self, x): + val = torch.jit._wait(self.fork_ops(x)) + return val + + class TestModule(torch.nn.Module): + def __init__(self): + super(TestModule, self).__init__() + + def forward(self, x): + w = torch.ones(5, 5) + b = torch.zeros(5) + return torch.nn.functional.linear(x, w, b) + + class ForkModule(nn.Module): + def __init__(self): + super(ForkModule, self).__init__() + self.test = TestModule() + + def forward(self, x): + fut = torch.jit._fork(self.test.forward, x) + return fut + + model = MainModule().eval() + traced = torch.jit.trace(model, (torch.randn(5, 5),)) + model = prepare_dynamic_jit(traced, {'' : default_qconfig}) + model = convert_dynamic_jit(model) + FileCheck().check("quantized::linear_dynamic") \ + .run(model.graph) + # Make sure model save works + b = io.BytesIO() + torch.jit.save(model, b) class TestQuantizeJitOps(QuantizationTestCase): """ Test graph mode 
post training static quantization works @@ -2683,11 +2732,11 @@ def forward(self, x): else: # for input of FC for dynamic quant assert len(attrs_with_prefix(m, '_observer_')) == 1 - observer_name = 'DynamicQuantObserver = prim::GetAttr[name="_observer_' + observer_name = 'Observer = prim::GetAttr[name="_observer_' FileCheck().check(observer_name) \ .check('prim::GetAttr[name="fc"]') \ .check('prim::CallMethod') \ - .check_not('Observer = prim::GetAttr[name="_observer_') \ + .check_not(observer_name) \ .run(m.graph) @@ -2723,7 +2772,7 @@ def forward(self, x): assert len(attrs_with_prefix(m.sub.fc, '_observer_')) == 1 FileCheck().check('prim::GetAttr[name="sub') \ .check('prim::CallMethod') \ - .check('DynamicQuantObserver = prim::GetAttr[name="_observer_') \ + .check('Observer = prim::GetAttr[name="_observer_') \ .check('prim::CallMethod') \ .check_not('Observer = prim::GetAttr[name="_observer_') \ .run(m.graph) @@ -3124,6 +3173,35 @@ def test_conv(self): inplace=False) self.assertEqual(model_quantized(self.img_data_2d[0][0]), result_eager) + @override_qengines + def test_conv_transpose(self): + r"""Compare the result of quantizing conv_transpose layer in + eager mode and graph mode + """ + if not qengine_is_qnnpack(): + return # Currently only qnnpack is supported + # eager mode + annotated_conv_model = AnnotatedConvTransposeModel( + torch.backends.quantized.engine).eval() + conv_model = ConvTransposeModel().eval() + # copy the weight from eager mode so that we can + # compare the result of the two quantized models later + conv_model.conv.weight = torch.nn.Parameter(annotated_conv_model.conv.weight.detach()) + model_eager = quantize(annotated_conv_model, test_only_eval_fn, self.img_data_2d) + qconfig_dict = {'': get_default_qconfig(torch.backends.quantized.engine)} + model_traced = torch.jit.trace(conv_model, self.img_data_2d[0][0]) + model_script = torch.jit.script(conv_model) + result_eager = model_eager(self.img_data_2d[0][0]) + for model_under_test in [model_traced, model_script]: + model_quantized = quantize_jit( + model_under_test, + qconfig_dict, + test_only_eval_fn, + [self.img_data_2d], + inplace=False) + self.assertEqual(model_quantized(self.img_data_2d[0][0]), + result_eager) + @override_qengines def test_conv_bn(self): r"""Compare the result of quantizing conv + bn layer in diff --git a/test/quantization/test_quantized_op.py b/test/quantization/test_quantized_op.py index 3a0e6f10bf33..ceef43dca51c 100644 --- a/test/quantization/test_quantized_op.py +++ b/test/quantization/test_quantized_op.py @@ -119,7 +119,6 @@ def _get_random_tensor_and_q_params(shapes, rand_scale, torch_type): X_scale = 1e-10 return X, X_scale, X_zero_point - class TestQuantizedOps(TestCase): """Helper function to test quantized activation functions.""" @@ -141,17 +140,17 @@ def _test_activation_function(self, X, fn_name, test_configs): quantized_fn: a list of the quantized functions to be tested reference_fn: the original reference function to be called on the the dequantized X - inplace_kwarg: the additional inplace keyword argument to test in-place + extra_kwargs: the additional keyword arguments for each test entry in ops_under_test, it must have at least the fields - for quantized_fn and reference_fn. If inplace_kwarg is missing, the - quantized function is assumed to be either inplace by default or the - test is not testing an inplace function. + for quantized_fn and reference_fn. output_range: the output range the operator will map to. 
By default, if it is no specified, the range will not be controlled and depend on Xmin and Xmax. change_zero_point: a boolean flag indicating if the zero point parameter should be determined based on torch_type during quantization (see sigmoid/hardsigmoid for examples). By default, if it is not specified, change_zero_point is assumed to be False and zero point will just take on the default value from X. + `output_is_observed`: if specified and is True, we'll append extra + output_scale/output_zero_point keyword argument when calling quantized op """ # Retrives the default parameters from X. X, (scale, zero_point, torch_type) = X @@ -163,15 +162,15 @@ def _test_activation_function(self, X, fn_name, test_configs): for op_group in test_configs: ref_op = op_group['reference_fn'] for q_op in op_group['quantized_fn']: + # Retrieves the inplace keyword arguments + # some functions require inplace=True to test in-place. + extra_kwargs = op_group.get('extra_kwargs', dict()) + output_is_observed = op_group.get('output_is_observed', False) # Quantizes and dequantizes to account for max error. qX = torch.quantize_per_tensor(X, scale=scale, zero_point=zero_point, dtype=torch_type) dqX = qX.dequantize() - dqY_hat = ref_op(dqX.clone()) - - # Retrieves the inplace keyword arguments - # some functions require inplace=True to test in-place. - inplace_kwarg = op_group.get('inplace_kwarg', dict()) + dqY_hat = ref_op(dqX.clone(), **extra_kwargs) # Adjusts output_scale if needed. # The output_scale determines the quantization scale for functions that @@ -195,8 +194,11 @@ def _test_activation_function(self, X, fn_name, test_configs): zero_point=output_zero_point, dtype=torch_type) + if output_is_observed: + extra_kwargs.update({'output_scale': scale, 'output_zero_point': zero_point}) + # Finds qY using in-place or non-in-place quantized operators. - qY = q_op(qX, **inplace_kwarg) + qY = q_op(qX, **extra_kwargs) self.assertEqual(qY, qY_hat, msg='{} - {} failed: ({} vs. 
{})'.format( fn_name, q_op, qY, qY_hat @@ -223,7 +225,7 @@ def test_qrelu(self, X): torch.nn.quantized.functional.relu, ], 'reference_fn': torch.nn.functional.relu, - 'inplace_kwarg': { + 'extra_kwargs': { 'inplace': True } } @@ -281,11 +283,30 @@ def test_qhardsigmoid(self, X): ] self._test_activation_function(X, 'hardsigmoid', hardsigmoid_test_configs) + @override_qengines + @given(X=hu.tensor(shapes=hu.array_shapes(1, 5, 1, 5), + qparams=hu.qparams())) + def test_leaky_relu_observed_output(self, X): + leaky_relu_test_configs = [ + { + 'quantized_fn': [ + torch.ops.quantized.leaky_relu + ], + 'reference_fn': torch.nn.functional.leaky_relu, + 'extra_kwargs': { + 'negative_slope': 0.1, + 'inplace': False, + }, + 'output_is_observed': True, + } + ] + self._test_activation_function(X, 'leaky_relu', leaky_relu_test_configs) + """Tests the correctness of the quantized::relu op.""" @given(X=hu.tensor(shapes=hu.array_shapes(1, 5, 1, 5), qparams=hu.qparams()), alpha=st.floats(0.0, 1.0, allow_nan=False, allow_infinity=False)) - def test_qrelu_leaky(self, X, alpha): + def test_leaky_relu(self, X, alpha): X, (scale, zero_point, torch_type) = X X = torch.from_numpy(X) @@ -907,7 +928,56 @@ def test_channel_shuffle(self, X, groups): self.assertEqual(a_ref, a_hat.dequantize(), msg="torch.nn.functional.channel_shuffle results are off") - """Tests max pool operation on quantized tensors.""" + """Tests 1D max pool operation on quantized tensors.""" + @given(X=hu.tensor(shapes=hu.array_shapes(min_dims=2, max_dims=3, + min_side=1, max_side=10), + qparams=hu.qparams()), + kernel=st.sampled_from((3, 5, 7)), + stride=st.sampled_from((None, 1, 2)), + dilation=st.integers(1, 2), + padding=st.integers(0, 2), + ceil_mode=st.booleans()) + def test_max_pool1d(self, X, kernel, stride, dilation, padding, ceil_mode): + X, (scale, zero_point, torch_type) = X + # Check constraints + assume(kernel // 2 >= padding) # Kernel cannot be overhanging! + iW = X.shape[-1] + oW = pool_output_shape(iW, kernel, padding, stride, dilation, ceil_mode) + assume(oW > 0) + + a = torch.from_numpy(X) + a_pool = torch.nn.functional.max_pool1d(a, kernel_size=kernel, + stride=stride, + padding=padding, + dilation=dilation, + ceil_mode=ceil_mode) + a_ref = torch.quantize_per_tensor(a_pool, scale=scale, + zero_point=zero_point, dtype=torch_type) + a_ref = a_ref.dequantize() + qa = torch.quantize_per_tensor(a, scale=scale, zero_point=zero_point, + dtype=torch_type) + + ops_under_test = { + "torch": torch.max_pool1d, + "nn.functional": torch.nn.functional.max_pool1d, + "nn.quantized.functional": torch.nn.quantized.functional.max_pool1d + } + + for name, op in ops_under_test.items(): + a_hat = op(qa, kernel_size=kernel, stride=stride, padding=padding, + dilation=dilation, ceil_mode=ceil_mode) + self.assertEqual(a_ref, a_hat.dequantize(), + msg="{} results are off".format(name)) + # Test the ops.quantized separately, because None is not treated. 
+ a_hat = torch.ops.quantized.max_pool1d( + qa, kernel_size=_single(kernel), + stride=_single(kernel if stride is None else stride), + padding=_single(padding), dilation=_single(dilation), + ceil_mode=ceil_mode) + self.assertEqual(a_ref, a_hat.dequantize(), + msg="ops.quantized.max_pool1d results are off") + + """Tests 2D max pool operation on quantized tensors.""" @given(X=hu.tensor(shapes=hu.array_shapes(min_dims=3, max_dims=4, min_side=1, max_side=10), qparams=hu.qparams()), @@ -1678,12 +1748,14 @@ def test_cat_nhwc(self, X, relu): torch.testing.assert_allclose(out.dequantize(), ref.dequantize()) self.assertNotEqual(out.stride(), sorted(out.stride())) - @given(X=hu.tensor(shapes=hu.array_shapes(min_dims=3, max_dims=3, - min_side=1, max_side=2), + @given(X=hu.tensor(shapes=hu.array_shapes(min_dims=1, max_dims=5, + min_side=1, max_side=4), qparams=hu.qparams()), - dim=st.integers(1, 2)) + dim=st.integers(-1, 5)) + @override_qengines def test_mean(self, X, dim): X, (scale, zero_point, torch_type) = X + assume(dim < X.ndim) qX = torch.quantize_per_tensor(torch.tensor(X).float(), scale, zero_point, torch_type) Y = torch.mean(qX.dequantize(), dim) @@ -2718,11 +2790,14 @@ def test_qlinear_unpack(self, W, use_channelwise): @unittest.skipIf(sys.platform == "darwin", "Known test failure on Mac.") class TestQuantizedEmbeddingOps(TestCase): - def _test_embedding_bag_unpack_fn(self, pack_fn, unpack_fn, num_embeddings, embedding_dim, bit_rate): + def _test_embedding_bag_unpack_fn(self, pack_fn, unpack_fn, num_embeddings, embedding_dim, bit_rate, optimized_qparams): weights = torch.from_numpy((np.random.random_sample(( num_embeddings, embedding_dim)) + 1).astype(np.float32)) - w_packed = pack_fn(weights) + if bit_rate == 8: + w_packed = pack_fn(weights) + else: + w_packed = pack_fn(weights, optimized_qparams=optimized_qparams) w_unpacked = unpack_fn(w_packed) if bit_rate == 8: @@ -2753,13 +2828,13 @@ def _test_embedding_bag_unpack_fn(self, pack_fn, unpack_fn, num_embeddings, embe conversion_op = "FloatToFused2BitRowwiseQuantized" reverse_conversion_op = "Fused2BitRowwiseQuantizedToFloat" - def get_c2_weights(weights): + def get_c2_weights(weights, engine_str): workspace.ResetWorkspace() workspace.FeedBlob("weights", weights) workspace.RunOperatorOnce( core.CreateOperator( - conversion_op, ["weights"], ["quantized_weights"] + conversion_op, ["weights"], ["quantized_weights"], engine=engine_str ) ) emb_q = workspace.FetchBlob("quantized_weights") @@ -2776,12 +2851,16 @@ def get_c2_weights(weights): ) return torch.from_numpy(emb_q), dequantized_data - w_packed_c2, w_unpacked_c2 = get_c2_weights(weights) + if optimized_qparams: + engine = "GREEDY" + else: + engine = "" + w_packed_c2, w_unpacked_c2 = get_c2_weights(weights, engine) # Compare packed weights against C2. 
- np.testing.assert_equal(w_packed.numpy(), w_packed_c2.numpy()) + np.testing.assert_allclose(w_packed.numpy(), w_packed_c2.numpy(), atol=1e-6, rtol=1e-6) # Compare unpacked weights against C2 - np.testing.assert_equal(w_unpacked.numpy(), w_unpacked_c2.numpy()) + np.testing.assert_allclose(w_unpacked.numpy(), w_unpacked_c2.numpy(), atol=1e-6, rtol=1e-6) """ Tests the correctness of the embedding_bag_8bit pack/unpack op against C2 """ @given(num_embeddings=st.integers(10, 100), @@ -2790,25 +2869,27 @@ def test_embedding_bag_byte_unpack(self, num_embeddings, embedding_dim): pack_fn = torch.ops.quantized.embedding_bag_byte_prepack unpack_fn = torch.ops.quantized.embedding_bag_byte_unpack - self._test_embedding_bag_unpack_fn(pack_fn, unpack_fn, num_embeddings, embedding_dim, bit_rate=8) + self._test_embedding_bag_unpack_fn(pack_fn, unpack_fn, num_embeddings, embedding_dim, 8, False) """ Tests the correctness of the embedding_bag_4bit pack/unpack op against C2 """ @given(num_embeddings=st.integers(10, 100), - embedding_dim=st.integers(5, 50).filter(lambda x: x % 4 == 0),) - def test_embedding_bag_4bit_unpack(self, num_embeddings, embedding_dim): + embedding_dim=st.integers(5, 50).filter(lambda x: x % 4 == 0), + optimized_qparams=st.booleans(),) + def test_embedding_bag_4bit_unpack(self, num_embeddings, embedding_dim, optimized_qparams): pack_fn = torch.ops.quantized.embedding_bag_4bit_prepack unpack_fn = torch.ops.quantized.embedding_bag_4bit_unpack - self._test_embedding_bag_unpack_fn(pack_fn, unpack_fn, num_embeddings, embedding_dim, bit_rate=4) + self._test_embedding_bag_unpack_fn(pack_fn, unpack_fn, num_embeddings, embedding_dim, 4, optimized_qparams) """ Tests the correctness of the embedding_bag_2bit pack/unpack op against C2 """ @given(num_embeddings=st.integers(10, 100), - embedding_dim=st.integers(5, 50).filter(lambda x: x % 8 == 0),) - def test_embedding_bag_2bit_unpack(self, num_embeddings, embedding_dim): + embedding_dim=st.integers(5, 50).filter(lambda x: x % 8 == 0), + optimized_qparams=st.booleans(),) + def test_embedding_bag_2bit_unpack(self, num_embeddings, embedding_dim, optimized_qparams): pack_fn = torch.ops.quantized.embedding_bag_2bit_prepack unpack_fn = torch.ops.quantized.embedding_bag_2bit_unpack - self._test_embedding_bag_unpack_fn(pack_fn, unpack_fn, num_embeddings, embedding_dim, bit_rate=2) + self._test_embedding_bag_unpack_fn(pack_fn, unpack_fn, num_embeddings, embedding_dim, 2, optimized_qparams) def embedding_bag_rowwise_offsets_run( self, bit_rate, num_embeddings, diff --git a/test/quantization/test_quantized_tensor.py b/test/quantization/test_quantized_tensor.py index fc3aa3c655eb..e919deb9d2bd 100644 --- a/test/quantization/test_quantized_tensor.py +++ b/test/quantization/test_quantized_tensor.py @@ -67,6 +67,75 @@ def _calculate_dynamic_qparams(X, dtype, reduce_range=False): def get_supported_device_types(): return ['cpu', 'cuda'] if torch.cuda.is_available() and not TEST_WITH_ROCM else ['cpu'] +# Note we explicitly cast variables to np.float32 in a couple of places to avoid +# the default casting in Python often resuling in double precision and to make +# sure we're doing the same numerics as C++ code. 
+def param_search_greedy(x, bit_rate, n_bins=200, ratio=0.16): + xmin, xmax = np.min(x), np.max(x) + stepsize = (xmax - xmin) / np.float32(n_bins) + min_bins = np.float32(n_bins) * (np.float32(1) - np.float32(ratio)) + xq, loss = _compress_uniform_simplified(x, bit_rate, xmin, xmax) + + solutions = [] # [(left, right, loss)] # local optima solution + + cur_min, cur_max, cur_loss = xmin, xmax, loss + thr = min_bins * stepsize + while cur_min + thr < cur_max: + # move left + xq, loss1 = _compress_uniform_simplified( + x, bit_rate, cur_min + stepsize, cur_max + ) + # move right + xq, loss2 = _compress_uniform_simplified( + x, bit_rate, cur_min, cur_max - stepsize + ) + + if cur_loss < loss1 and cur_loss < loss2: + # found a local optima + solutions.append((cur_min, cur_max, cur_loss)) + if loss1 < loss2: + cur_min, cur_max, cur_loss = cur_min + stepsize, cur_max, loss1 + else: + cur_min, cur_max, cur_loss = cur_min, cur_max - stepsize, loss2 + if len(solutions): + best = solutions[0] + for solution in solutions: + if solution[-1] < best[-1]: + best = solution + return best[1], best[0] # xmax, xmin + return xmax, xmin + + +def _compress_uniform_simplified(X, bit_rate, xmin, xmax, fp16_scale_bias=True): + # affine transform to put Xq in [0,2**bit_rate - 1] + # Xq = (2 ** bit_rate - 1) * (Xq - xmin) / data_range + if fp16_scale_bias: + xmin = xmin.astype(np.float16).astype(np.float32) + data_range = xmax - xmin + scale = np.where( + data_range == 0, np.float32(1), data_range / np.float32(2 ** bit_rate - 1) + ) + if fp16_scale_bias: + scale = scale.astype(np.float16).astype(np.float32) + inverse_scale = np.float32(1) / scale + Xq = np.clip(np.round((X - xmin) * inverse_scale), 0, np.float32(2 ** bit_rate - 1)) + Xq = Xq * scale + xmin + + # Manually compute loss instead of using np.linalg.norm to use the same + # accumulation order used by C++ code + vlen = 8 + loss_v = np.zeros(vlen).astype(np.float32) + for i in range(len(Xq) // vlen * vlen): + loss_v[i % vlen] += (X[i] - Xq[i]) * (X[i] - Xq[i]) + loss = np.float32(0) + for i in range(vlen): + loss += loss_v[i] + for i in range(len(Xq) // vlen * vlen, len(Xq)): + loss += (X[i] - Xq[i]) * (X[i] - Xq[i]) + loss = np.sqrt(loss) + + return Xq, loss + class TestQuantizedTensor(TestCase): def test_qtensor(self): num_elements = 10 @@ -103,6 +172,36 @@ def test_qtensor(self): "quantization_scheme=torch.per_tensor_affine, " + "scale=1.0, zero_point=2)") + def test_qtensor_sub_byte(self): + num_elements = 10 + scale = 1.0 + zero_point = 2 + for dtype in [torch.quint4x2]: + r = torch.ones((5, 2), dtype=torch.float) + qr = torch.quantize_per_tensor(r, scale, zero_point, dtype) + self.assertEqual(qr.q_scale(), scale) + self.assertEqual(qr.q_zero_point(), zero_point) + self.assertTrue(qr.is_quantized) + self.assertFalse(r.is_quantized) + self.assertEqual(qr.storage().size(), 5) + + int_repr = qr.int_repr() + for num in int_repr[0:5]: + self.assertEqual(num, 51) # Packed entries, each of value 3, i.e. 
00110011 + + # Test tensor creation + q = torch._empty_affine_quantized([num_elements], scale=scale, zero_point=zero_point, + dtype=torch.quint4x2) + self.assertEqual(q.storage().size(), 5) + + # Test save/load + with tempfile.NamedTemporaryFile() as f: + torch.save(qr, f) + f.seek(0) + loaded_q = torch.load(f) + loaded_int_repr = loaded_q.int_repr()[0:5] + self.assertEqual(int_repr[0:5], loaded_int_repr) + def test_qtensor_float_assignment(self): # Scalar Tensor # item @@ -216,15 +315,10 @@ def test_qtensor_dtypes(self): r = torch.rand(3, 2, dtype=torch.float) * 4 - 2 scale = 0.2 zero_point = 2 - qr = torch.quantize_per_tensor(r, scale, zero_point, torch.qint8) - rqr = qr.dequantize() - self.assertTrue(np.allclose(r.numpy(), rqr.numpy(), atol=2 / scale)) - qr = torch.quantize_per_tensor(r, scale, zero_point, torch.quint8) - rqr = qr.dequantize() - self.assertTrue(np.allclose(r.numpy(), rqr.numpy(), atol=2 / scale)) - qr = torch.quantize_per_tensor(r, scale, zero_point, torch.qint32) - rqr = qr.dequantize() - self.assertTrue(np.allclose(r.numpy(), rqr.numpy(), atol=2 / scale)) + for dtype in [torch.qint8, torch.quint8, torch.qint32, torch.quint4x2]: + qr = torch.quantize_per_tensor(r, scale, zero_point, dtype) + rqr = qr.dequantize() + self.assertTrue(np.allclose(r.numpy(), rqr.numpy(), atol=2 / scale)) def _test_quantize_per_channel(self, r, scales, zero_points, axis, float_params): @@ -335,6 +429,52 @@ def quantize_ref(data, scales, zero_points): zero_points = torch.tensor([0.1, 0.2, 1.], dtype=torch.float) self._test_quantize_per_channel(r, scales, zero_points, 0, True) + def test_quantize_per_channel_sub_byte(self): + """ Tests the per channel quantization scheme for 4-bit qtensors. + The scale and zero point for this have to be in floating point. """ + r = torch.rand(3, 2, dtype=torch.float) * 4 + scales = torch.tensor([0.2, 0.3, 0.1], dtype=torch.float) + zero_points = torch.tensor([0.1, 0.2, 0.3], dtype=torch.float) + qr = torch.quantize_per_channel(r, scales, zero_points, 0, torch.quint4x2) + dequant_tensor = qr.dequantize() + + def _get_qranges(bit_width): + if bit_width == 4: + return 0, 15 + + def _quantize_per_channel_sub_byte_ref(data, scales, zero_points, axis, bit_width): + dims = data.size() + data = data.view(-1, dims[axis], np.prod(dims[axis + 1:])) + qtensor_size = math.ceil(data.numel() / 2) + res = torch.empty(qtensor_size, dtype=torch.uint8) + elem_per_byte = 8 / bit_width + quant_min, quant_max = _get_qranges(bit_width) + for i in range(data.size()[0]): + for j in range(data.size()[1]): + for k in range(data.size()[2]): + inv_scale = 1.0 / scales[j] + index = i * data.size()[1] * data.size()[2] + j * data.size()[2] + k + qvalue = np.clip( + np.round(data[i][j][k] * inv_scale + zero_points[j]), quant_min, quant_max).to(dtype=torch.int) + res_idx = int(index / elem_per_byte) + if (index % elem_per_byte == 0): + res[res_idx] = qvalue + else: + res[res_idx] |= (qvalue << ((index % elem_per_byte) * bit_width)) + return res + + ref_res = _quantize_per_channel_sub_byte_ref(r, scales, zero_points, 0, 4) + self.assertTrue(np.allclose(qr.int_repr(), ref_res)) + self.assertTrue(np.allclose(r.numpy(), dequant_tensor.numpy(), atol=1 / np.min(scales.numpy()))) + + # Check 4D tensor with non-zero axis. 
+ r = torch.rand(3, 2, 4, 5, dtype=torch.float) * 4 + scales = torch.tensor([0.2, 0.03], dtype=torch.float) + zero_points = torch.tensor([0.1, 0.2], dtype=torch.float) + qr = torch.quantize_per_channel(r, scales, zero_points, axis=1, dtype=torch.quint4x2) + ref_res = _quantize_per_channel_sub_byte_ref(r, scales, zero_points, 1, 4) + self.assertTrue(np.allclose(qr.int_repr(), ref_res)) + def test_qtensor_permute(self): scale = 0.02 zero_point = 1 @@ -422,7 +562,9 @@ def test_qtensor_per_channel_load_save(self): scales = torch.rand(10, dtype=torch.double) * 0.02 + 0.01 zero_points = torch.round(torch.rand(10) * 20 + 1).to(torch.long) # quint32, cuda is not supported yet - for dtype in [torch.quint8, torch.qint8]: + for dtype in [torch.quint8, torch.qint8, torch.quint4x2]: + if dtype == torch.quint4x2: + zero_points = torch.ones(10, dtype=torch.float) qr = torch.quantize_per_channel(r, scales, zero_points, 1, dtype) with tempfile.NamedTemporaryFile() as f: # Serializing and Deserializing Tensor @@ -745,3 +887,11 @@ def test_fp16_saturate_op(self): ref[0] = torch.ones(5) * -65504 y = torch._saturate_weight_to_fp16(x) self.assertEqual(y, ref) + + def test_choose_qparams_optimized(self): + for bit_width in [4, 2]: + x = torch.randn(64, dtype=torch.float) + y = torch.choose_qparams_optimized(x, numel=64, n_bins=200, ratio=0.16, bit_width=bit_width) + ref = param_search_greedy(x.numpy(), bit_rate=bit_width) + self.assertEqual(y[0].numpy(), ref[0]) + self.assertEqual(y[1].numpy(), ref[1]) diff --git a/test/quantization/test_workflow_module.py b/test/quantization/test_workflow_module.py index 817e54460e07..6d1dd2b1b698 100644 --- a/test/quantization/test_workflow_module.py +++ b/test/quantization/test_workflow_module.py @@ -5,7 +5,6 @@ PerChannelMinMaxObserver, MovingAverageMinMaxObserver, MovingAveragePerChannelMinMaxObserver, - MinMaxDynamicQuantObserver, HistogramObserver, RecordingObserver, PlaceholderObserver, @@ -16,6 +15,7 @@ default_per_channel_weight_observer, get_observer_dict, prepare, + QConfig, ) from torch.quantization._learnable_fake_quantize import ( @@ -44,6 +44,7 @@ QuantizationTestCase, AnnotatedSingleLayerLinearModel, test_only_eval_fn, + SingleLayerLinearModel, ) from torch.testing._internal.common_quantized import ( @@ -265,25 +266,6 @@ def test_per_tensor_observers(self, qdtype, qscheme, reduce_range): self.assertEqual(myobs.calculate_qparams(), loaded_obs.calculate_qparams()) - @given(X=hu.tensor(shapes=hu.array_shapes(min_dims=2, max_dims=4, - min_side=1, max_side=10), - qparams=hu.qparams()), - reduce_range=st.booleans()) - def test_per_tensor_dynamic_quant_observers(self, X, reduce_range): - - X, (scale, zero_point, torch_type) = X - x = torch.from_numpy(X) - - obs = MinMaxDynamicQuantObserver(dtype=torch.quint8, reduce_range=reduce_range) - - result = obs(x) - qparams = obs.calculate_qparams() - ref = torch._choose_qparams_per_tensor(x, reduce_range) - - self.assertEqual(ref[0], qparams[0]) - self.assertEqual(ref[1], qparams[1]) - - @given(qdtype=st.sampled_from((torch.qint8, torch.quint8)), qscheme=st.sampled_from((torch.per_channel_affine, torch.per_channel_symmetric, torch.per_channel_affine_float_qparams)), ch_axis=st.sampled_from((0, 1, 2, 3)), reduce_range=st.booleans()) @@ -394,7 +376,7 @@ def test_per_channel_observers(self, qdtype, qscheme, ch_axis, reduce_range): def test_observer_scriptable(self): - obs_list = [MinMaxObserver(), MovingAverageMinMaxObserver(), MinMaxDynamicQuantObserver()] + obs_list = [MinMaxObserver(), MovingAverageMinMaxObserver()] for obs in 
obs_list: scripted = torch.jit.script(obs) @@ -423,7 +405,7 @@ def test_state_dict_respects_device_affinity(self): [device_cpu, device_cuda], [device_cpu, device_cuda], [MinMaxObserver, MovingAverageMinMaxObserver, - MinMaxDynamicQuantObserver, PerChannelMinMaxObserver, + PerChannelMinMaxObserver, MovingAveragePerChannelMinMaxObserver, # TODO: enable this (separate PR) # HistogramObserver, @@ -473,6 +455,32 @@ def test_histogram_observer_save_load_state_dict(self): self.assertEqual(obs2.max_val.shape, torch.Size([])) + def test_save_load_state_dict_script(self): + """ + Tests that we can save and load state_dict for observers that are scripted + in a quantized model. + """ + obs_list = [MinMaxObserver, MovingAverageMinMaxObserver, + PerChannelMinMaxObserver, + MovingAveragePerChannelMinMaxObserver, HistogramObserver] + + for obs in obs_list: + model = SingleLayerLinearModel().eval() + qconfig = QConfig(activation=default_observer, weight=obs) + qconfig_dict = {'' : qconfig} + scripted = torch.jit.script(model) + scripted = torch.quantization.prepare_jit(scripted, qconfig_dict) + x = torch.rand(5, 5) + scripted(x) + obs_dict = torch.quantization.get_observer_state_dict(scripted) + + # Load stats + scripted_2 = torch.jit.script(model) + scripted_2 = torch.quantization.prepare_jit(scripted_2, qconfig_dict) + torch.quantization.load_observer_state_dict(scripted_2, obs_dict) + # Verify that state_dict matches exactly with original one. + self.assertEqual(scripted.state_dict(), scripted_2.state_dict()) + # HistogramObserver that works like it does on master class _ReferenceHistogramObserver(HistogramObserver): def __init__(self, *args, **kwargs): @@ -1417,7 +1425,6 @@ def test_observers_preserve_buffers(self): observer_types = [ torch.quantization.MinMaxObserver.with_args(dtype=torch.qint8), torch.quantization.MovingAverageMinMaxObserver.with_args(dtype=torch.qint8), - torch.quantization.MinMaxDynamicQuantObserver.with_args(dtype=torch.qint8), torch.quantization.PerChannelMinMaxObserver.with_args(dtype=torch.qint8), torch.quantization.MovingAveragePerChannelMinMaxObserver.with_args(dtype=torch.qint8), torch.quantization.HistogramObserver.with_args(dtype=torch.qint8), @@ -1536,6 +1543,21 @@ def forward(self, x): isinstance(fused_model.conv.bn, nn.SyncBatchNorm), "Expected BN to be converted to SyncBN") + def test_syncbn_preserves_qconfig(self): + """ + Makes sure that if a BatchNorm is not fused and a qconfig exists, + convering the module to SyncBatchNorm preserves the qconfig. 
+ """ + m = nn.Sequential( + nn.Conv2d(1, 1, 1), + nn.BatchNorm2d(1), + ) + m[1].qconfig = torch.quantization.default_qconfig + m = torch.nn.SyncBatchNorm.convert_sync_batchnorm(m) + self.assertTrue( + hasattr(m[1], "qconfig"), + "missing qconfig after SyncBatchNorm conversion") + @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") @override_qengines diff --git a/test/run_test.py b/test/run_test.py index 606e20a6f723..2af7405e300b 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -13,7 +13,7 @@ import torch import torch._six from torch.utils import cpp_extension -from torch.testing._internal.common_utils import TEST_WITH_ROCM, shell +from torch.testing._internal.common_utils import TEST_WITH_ROCM, shell, FILE_SCHEMA import torch.distributed as dist from typing import Dict, Optional @@ -41,6 +41,7 @@ 'test_foreach', 'test_indexing', 'test_jit', + 'test_linalg', 'test_logging', 'test_mkldnn', 'test_multiprocessing', @@ -89,7 +90,8 @@ 'test_determination', 'test_futures', 'test_fx', - 'test_functional_autograd_benchmark' + 'test_functional_autograd_benchmark', + 'test_package', ] WINDOWS_BLOCKLIST = [ @@ -98,7 +100,6 @@ 'distributed/rpc/test_process_group_agent', 'distributed/rpc/test_tensorpipe_agent', 'distributed/test_distributed_fork', - 'distributed/test_distributed_spawn', ] ROCM_BLOCKLIST = [ @@ -109,7 +110,6 @@ 'test_determination', 'test_multiprocessing', 'test_jit_legacy', - 'test_tensorexpr', 'test_type_hints', 'test_openmp', ] @@ -200,6 +200,15 @@ PYTORCH_COLLECT_COVERAGE = bool(os.environ.get("PYTORCH_COLLECT_COVERAGE")) +JIT_EXECUTOR_TESTS = [ + 'test_jit_cuda_fuser_profiling', + 'test_jit_cuda_fuser_legacy', + 'test_jit_profiling', + 'test_jit_legacy', + 'test_jit_fuser_legacy', + 'test_jit_fuser_te', + 'test_tensorexpr'] + def print_to_stderr(message): print(message, file=sys.stderr) @@ -305,9 +314,13 @@ def test_distributed(test_module, test_directory, options): 'MPI not available -- MPI backend tests will be skipped') config = DISTRIBUTED_TESTS_CONFIG for backend, env_vars in config.items(): + if sys.platform == 'win32' and backend != 'gloo': + continue if backend == 'mpi' and not mpi_available: continue for with_init_file in {True, False}: + if sys.platform == 'win32' and not with_init_file: + continue tmp_dir = tempfile.mkdtemp() if options.verbose: init_str = "with {} init_method" @@ -321,9 +334,9 @@ def test_distributed(test_module, test_directory, options): os.environ.update(env_vars) if with_init_file: if test_module in ["test_distributed_fork", "test_distributed_spawn"]: - init_method = 'file://{}/'.format(tmp_dir) + init_method = f'{FILE_SCHEMA}{tmp_dir}/' else: - init_method = 'file://{}/shared_init_file'.format(tmp_dir) + init_method = f'{FILE_SCHEMA}{tmp_dir}/shared_init_file' os.environ['INIT_METHOD'] = init_method try: os.mkdir(os.path.join(tmp_dir, 'barrier')) @@ -446,6 +459,19 @@ def parse_args(): nargs='*', help='additional arguments passed through to unittest, e.g., ' 'python run_test.py -i sparse -- TestSparse.test_factory_size_check') + parser.add_argument( + '--shard', + nargs=2, + type=int, + help='runs a shard of the tests (taking into account other selections), e.g., ' + '--shard 2 3 will break up the selected tests into 3 shards and run the tests ' + 'in the 2nd shard (the first number should not exceed the second)', + ) + parser.add_argument( + '--exclude-jit-executor', + action='store_true', + help='exclude tests that are run for a specific jit config' + ) return 
parser.parse_args() @@ -513,6 +539,17 @@ def get_selected_tests(options): last_index = find_test_index(options.last, selected_tests, find_last_index=True) selected_tests = selected_tests[:last_index + 1] + if options.shard: + assert len(options.shard) == 2, "Unexpected shard format" + assert min(options.shard) > 0, "Shards must be positive numbers" + which_shard, num_shards = options.shard + assert which_shard <= num_shards, "Selected shard must be less or equal that total number of shards" + assert num_shards <= len(selected_tests), f"Number of shards must be less than {len(selected_tests)}" + selected_tests = selected_tests[which_shard - 1 :: num_shards] + + if options.exclude_jit_executor: + options.exclude.extend(JIT_EXECUTOR_TESTS) + selected_tests = exclude_tests(options.exclude, selected_tests) if sys.platform == 'win32' and not options.ignore_win_blocklist: diff --git a/test/test_autograd.py b/test/test_autograd.py index 9d037fd7c138..6bd6925e015f 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -31,7 +31,7 @@ from torch.utils.checkpoint import checkpoint from torch.testing._internal.common_utils import (TEST_MKL, TEST_WITH_ROCM, TestCase, run_tests, skipIfNoLapack, suppress_warnings, slowTest, - load_tests, random_symmetric_pd_matrix, random_symmetric_matrix, + load_tests, random_symmetric_matrix, IS_WINDOWS, IS_MACOS, CudaMemoryLeakCheck) from torch.autograd import Variable, Function, detect_anomaly from torch.autograd.function import InplaceFunction @@ -1001,6 +1001,53 @@ def gen_enable_grad(): for _ in gen_enable_grad(): self.assertEqual(torch.is_grad_enabled(), False) + def test_set_grad_generator_functions_recursive(self): + # enable_grad_decorator_recursive and no_grad_decorator_recursive call each other + # recursively, to ensure that the decorators preserve the caller's setting + @torch.enable_grad() + def enable_grad_decorator_recursive(depth): + self.assertTrue(torch.is_grad_enabled()) + if depth > 0: + no_grad_decorator_recursive(depth - 1) + self.assertTrue(torch.is_grad_enabled()) + + @torch.no_grad() + def no_grad_decorator_recursive(depth): + self.assertFalse(torch.is_grad_enabled()) + if depth > 0: + enable_grad_decorator_recursive(depth - 1) + self.assertFalse(torch.is_grad_enabled()) + + # enable_grad_context_manager_recursive and no_grad_context_manager_recursive call + # each other recursively, to ensure that the decorators preserve the caller's setting + def enable_grad_context_manager_recursive(depth): + with torch.enable_grad(): + self.assertTrue(torch.is_grad_enabled()) + if depth > 0: + no_grad_context_manager_recursive(depth - 1) + self.assertTrue(torch.is_grad_enabled()) + + def no_grad_context_manager_recursive(depth): + with torch.no_grad(): + self.assertFalse(torch.is_grad_enabled()) + if depth > 0: + enable_grad_context_manager_recursive(depth - 1) + self.assertFalse(torch.is_grad_enabled()) + + with torch.enable_grad(): + self.assertTrue(torch.is_grad_enabled()) + enable_grad_decorator_recursive(10) + self.assertTrue(torch.is_grad_enabled()) + enable_grad_context_manager_recursive(10) + self.assertTrue(torch.is_grad_enabled()) + + with torch.no_grad(): + self.assertFalse(torch.is_grad_enabled()) + enable_grad_decorator_recursive(10) + self.assertFalse(torch.is_grad_enabled()) + enable_grad_context_manager_recursive(10) + self.assertFalse(torch.is_grad_enabled()) + def test_no_grad_python_function(self): """Python Functions should respect grad mode.""" x = torch.ones(5, 5, requires_grad=True) @@ -2454,22 +2501,28 @@ def 
test_var_mean_differentiable(self): @skipIfNoLapack def test_cholesky(self): def func(root, upper): - x = torch.matmul(root, root.transpose(-1, -2)) + 1e-05 + x = 0.5 * (root + root.transpose(-1, -2).conj()) return torch.cholesky(x, upper) - def run_test(upper, dims): - root = torch.rand(*dims, requires_grad=True) + def run_test(upper, dims, dtype): + root = torch.rand(*dims, dtype=dtype, requires_grad=True) + root = root + torch.eye(dims[-1]) gradcheck(func, [root, upper]) - gradgradcheck(func, [root, upper]) + # TODO: gradgradcheck does not work correctly yet for complex + if not dtype.is_complex: + gradgradcheck(func, [root, upper]) - root = random_symmetric_pd_matrix(dims[-1], *dims[:-2]).requires_grad_() + root = torch.rand(*dims, dtype=dtype) + root = torch.matmul(root, root.transpose(-1, -2).conj()) + root.requires_grad_() chol = root.cholesky().sum().backward() - self.assertEqual(root.grad, root.grad.transpose(-1, -2)) # Check the gradient is symmetric + self.assertEqual(root.grad, root.grad.transpose(-1, -2).conj()) # Check the gradient is hermitian - for upper, dims in product([True, False], [(3, 3), (4, 3, 2, 2)]): - run_test(upper, dims) - run_test(upper, dims) + for upper, dims, dtype in product([True, False], + [(3, 3), (4, 3, 2, 2)], + [torch.double, torch.cdouble]): + run_test(upper, dims, dtype) @skipIfNoLapack def test_cholesky_solve(self): @@ -2545,6 +2598,67 @@ def run_test(upper, dims): for upper, dims in product([True, False], [(3, 3), (5, 3, 3), (4, 3, 2, 2)]): run_test(upper, dims) + @slowTest + @skipIfNoLapack + def test_lobpcg(self): + + def func(k, A, largest=True, B=None): + X_shape = list(A.shape) + X_shape[-1] = k + X = torch.eye(A.size(-2), k, dtype=A.dtype, device=A.device) + if A.dim() > 2: + X = X.expand(X_shape) + + D, U = torch.lobpcg(A=A, k=k, B=B, X=X) + + # LOBPCG uses a random initial eigenspace approximation + # if parameter `X` is not provided. + # This may cause a non-deterministic behavior + # when it comes to the sign of an eigenvector + # (note if v is an eigenvector, so is -v), + # hence we eliminate this non-determinism + # by making sure that each column of U + # gets multiplied by the sign of its max (in absolute value) element. + # Also, gradcheck changes the content of the input by +/- eps (default to 1e-06) + # to compute the numerical gradient which can also cause the signs to flip. + _, idx = U.abs().max(-2, keepdim=True) + sign = U.gather(-2, idx).sign() + U = U * sign + return D, U + + def run_symeig_test(k, sizes, largest=True): + A = torch.rand(*sizes).double() + A = A.matmul(A.transpose(-1, -2)) / 10 + A.requires_grad_(True) + + gradcheck(lambda A: func(k, A, largest), A) + + # Custom gradient vectors for better stability due to some + # non-determinism in the lobpcg's forward. + # Note it is not required if symeig is in forward instead (tested). + D_grad = torch.rand(*A.shape[:-2], k) / 100 + U_grad = torch.rand(*A.shape[:-1], k) / 100 + gradgradcheck(lambda A: func(k, A, largest), A, [D_grad, U_grad], atol=1e-4) + + # check whether A.grad is symmetric + A = A.detach().requires_grad_(True) + D, U = func(k, A, largest) + (D.sum() + U.sum()).backward() + self.assertEqual(A.grad, A.grad.transpose(-1, -2)) + + # the tests below take about 1-2 minutes to finish, + # but we want to be extra sure that the backward is correct. 
+ for largest in [True, False]: + run_symeig_test(1, (6, 6), largest=largest) + run_symeig_test(1, (2, 6, 6), largest=largest) + run_symeig_test(1, (2, 2, 6, 6), largest=largest) + run_symeig_test(2, (6, 6), largest=largest) + run_symeig_test(2, (2, 6, 6), largest=largest) + run_symeig_test(2, (2, 2, 6, 6), largest=largest) + run_symeig_test(3, (9, 9), largest=largest) + run_symeig_test(3, (2, 9, 9), largest=largest) + run_symeig_test(3, (2, 2, 9, 9), largest=largest) + @skipIfNoLapack def test_cholesky_inverse(self): def _test_with_size(upper, dims): @@ -4527,6 +4641,33 @@ def test(inp, inp_dtype, out_dtype): test(inp, torch.float, torch.double) test(inp, torch.double, torch.float) + def test_nan_to_num(self): + a = torch.randn(3, 3, 3, 3) + with torch.no_grad(): + a[torch.rand_like(a) < 0.2] = float('nan') + a[torch.rand_like(a) < 0.2] = float('inf') + a[torch.rand_like(a) < 0.2] = -float('inf') + + a.requires_grad = True + + gradcheck(lambda x: x.nan_to_num(), a) + gradgradcheck(lambda x: x.nan_to_num(), a) + + gradcheck(lambda x: x.nan_to_num(nan=1.2), a) + gradgradcheck(lambda x: x.nan_to_num(nan=1.2), a) + + gradcheck(lambda x: x.nan_to_num(nan=1.2, posinf=2.0), a) + gradgradcheck(lambda x: x.nan_to_num(nan=1.2, posinf=2.0), a) + + gradcheck(lambda x: x.nan_to_num(nan=1.2, posinf=2.0, neginf=-2.0), a) + gradgradcheck(lambda x: x.nan_to_num(nan=1.2, posinf=2.0, neginf=-2.0), a) + + gradcheck(lambda x: x.nan_to_num(posinf=2.0, neginf=-2.0), a) + gradgradcheck(lambda x: x.nan_to_num(posinf=2.0, neginf=-2.0), a) + + gradcheck(lambda x: x.nan_to_num(neginf=-2.0), a) + gradgradcheck(lambda x: x.nan_to_num(neginf=-2.0), a) + def test_custom_function_error(self): class BadFw(Function): @staticmethod @@ -4690,10 +4831,11 @@ def run_functional_checks(test_case, test_name, name, apply_fn, run_grad_checks, complex_list = ['t', 'view', 'reshape', 'reshape_as', 'view_as', 'roll', 'clone', 'repeat', 'expand', 'flip', 'fliplr', 'flipud', 'rot90', 'transpose', 'permute', 'squeeze', 'unsqueeze', 'resize', 'resize_as', 'tril', 'triu', - 'chunk', 'split', 'split_with_sizes', 'repeat', 'expand', 'zero_', 'round', + 'chunk', 'split', 'split_with_sizes', 'repeat', 'expand', 'zero_', 'eq_', 'ne_', 'add', '__radd__', 'sum', 'conj', 'sin', 'cos', 'mul', 'sinh', - 'cosh', '__rmul__'] + separate_complex_tests + 'cosh', '__rmul__', 'sgn', 'abs', 'dot', 'vdot'] + separate_complex_tests +# TODO(@anjali411): add tests for 'sub', 'div # TODO(@anjali411): add the commented tests back after updating the formula based on tensorflow definition - @anjali411 # complex_list += ['fill_', 't', '__rdiv__', 'tanh'] @@ -4817,7 +4959,9 @@ def fn(*inputs): 'broadcast_all' in test_name or 'atanh' in test_name or 'acosh' in test_name or - 'asinh' in test_name) + 'asinh' in test_name or + 'abs_complex' in test_name or + 'abs_scalar_complex' in test_name) if hasattr(torch.ones(1), inplace_name) and not skip_inplace: output_variable = getattr(self_variable, name)(*args_variable, **kwargs_variable) if not isinstance(output_variable, tuple): @@ -4864,7 +5008,10 @@ def fn(*inputs): inplace_name = name + '_' # can't broadcast inplace to left hand side broadcast_skip_inplace = 'broadcast_lhs' in test_name or 'broadcast_all' in test_name - if hasattr(torch.ones(1), inplace_name) and not broadcast_skip_inplace: + # skip C -> R inplace tests + skip_c_to_r_inplace = 'abs_complex' in test_name or 'abs_scalar_complex' in test_name + skip_inplace = broadcast_skip_inplace or skip_c_to_r_inplace + if hasattr(torch.ones(1), inplace_name) and not 
skip_inplace: check(inplace_name) assert not hasattr(TestAutograd, test_name), 'Two tests have the same name: ' + test_name @@ -5936,11 +6083,13 @@ class TestAutogradDeviceType(TestCase): def test_min_max_median_backprops_to_all_values(self, device): for f in [torch.min, torch.max, torch.median]: - x = torch.tensor([1., 0., 1., 0., 1., 0.], device=device, requires_grad=True) - y = f(x) - y.backward() - self.assertEqual(x.grad.sum(), 1.) - self.assertEqual((x.grad == 1 / 3).sum(), 3) + x1 = torch.tensor([1., 0., 1., 0., 1., 0.], device=device, requires_grad=True) + x2 = torch.tensor([float('nan'), float('nan'), float('nan')], requires_grad=True) + for x in [x1, x2]: + y = f(x) + y.backward() + self.assertEqual(x.grad.sum(), 1.) + self.assertEqual((x.grad == 1 / 3).sum(), 3) # skip this test if running on rocm, because in cdist # we use __shfl_down_sync on CUDA for fast reduction @@ -5990,6 +6139,18 @@ def _test_euclidean_large_cdist(sizex, sizey=None): _test_cdist_for_size((1, 1), (S, 1)) _test_euclidean_large_cdist((2000, 5)) + # Ensure that cdist backward with p<1 does not produce NaNs + def test_cdist_grad_p_lt_1_no_nan(self, device): + for p in [0.99, 0.7, 0.5, 0.1, 0.01]: + x = torch.randn(1, 2, device=device) + y = x.clone().detach() + torch.tensor([[1., 0.]], device=device) + x.requires_grad = True + y.requires_grad = True + result = torch.cdist(x, y, p=p) + result.backward(torch.ones_like(result)) + self.assertFalse(torch.isnan(x.grad).any()) + self.assertFalse(torch.isnan(y.grad).any()) + def test_cdist_same_inputs(self, device): # Test to detect issues in cdist gradient calculation # When the distances are 0 diff --git a/test/test_cuda.py b/test/test_cuda.py index 011e8c374645..498fd199066f 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -1,6 +1,7 @@ import collections import io import tempfile +from typing import NamedTuple import unittest import sys from itertools import repeat, chain, product @@ -14,6 +15,7 @@ import torch.cuda import torch.cuda.comm as comm from torch import multiprocessing as mp +from torch.nn.parallel import scatter_gather from torch._six import inf, nan, container_abcs from test_torch import AbstractTestCases @@ -21,7 +23,7 @@ from torch.testing._internal.common_methods_invocations import tri_tests_args, tri_large_tests_args, \ _compare_trilu_indices, _compare_large_trilu_indices from torch.testing._internal.common_utils import TestCase, get_gpu_type, freeze_rng_state, run_tests, \ - NO_MULTIPROCESSING_SPAWN, skipIfRocm, load_tests, \ + NO_MULTIPROCESSING_SPAWN, skipIfRocm, load_tests, IS_SANDCASTLE, \ slowTest, skipCUDANonDefaultStreamIf, TEST_WITH_ROCM, TEST_NUMPY from torch.testing._internal.autocast_test_lists import AutocastTestLists @@ -279,6 +281,18 @@ def assert_change(comp=1, empty_cache=False, reset_peak=False): assert_change(0, empty_cache=True) assert_change(0, reset_peak=True) + @skipIfRocm + def test_cudart_register(self): + t = torch.ones(20) + self.assertFalse(t.is_pinned()) + cudart = torch.cuda.cudart() + r = cudart.cudaHostRegister(t.data_ptr(), t.numel() * t.element_size(), 0) + self.assertEqual(r, 0) + self.assertTrue(t.is_pinned()) + r = cudart.cudaHostUnregister(t.data_ptr()) + self.assertEqual(r, 0) + self.assertFalse(t.is_pinned()) + def test_memory_stats(self): gc.collect() torch.cuda.empty_cache() @@ -1720,6 +1734,7 @@ def test_streaming_backwards_device_transfer(self): self.assertTrue(b.grad.sum().item() == 4 * size) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @unittest.skipIf(IS_SANDCASTLE,
"Does not work on Sandcastle") def test_cuda_init_race(self): # See https://github.com/pytorch/pytorch/issues/16559 import subprocess @@ -1736,32 +1751,102 @@ def worker(rank): t2.start() """]) - def test_grad_scaling_builtins(self, device="cuda", dtype=torch.float): - inv_scale = torch.tensor([0.25], dtype=dtype, device=device) + def test_grad_scaling_unscale(self, dtype=torch.float): + inv_scale = torch.full((1,), 0.25, dtype=torch.float, device="cuda:0") + found_inf = torch.full((1,), 0.0, dtype=torch.float, device="cuda:0") + + size = 10 + g = torch.full((size, size), 4.0, dtype=dtype, device="cuda:0") + ginf = g.clone() + ginf[2, 2] = float('inf') + gnan = g.clone() + gnan[2, 2] = float('nan') + + # Tries selected combinations of + # - contiguous grads + # - g.clone().t() which is not contiguous but still non overlapping and dense + # - variants of g.clone()[:, :5] which are not non overlapping and dense + # Non overlapping and dense grads route into a multi tensor apply kernel, + # others use a fallback per-tensor kernel, so we should try both. + cases = ( + ([g.clone(), g.clone()], False), + ([g.clone(), g.clone().t()], False), + ([g.clone(), g.clone()[:, :5]], False), + ([g.clone()[:, :5], g.clone()[:, :5]], False), + ([g.clone(), ginf.clone()], True), + ([g.clone(), gnan.clone()], True), + ([g.clone(), ginf.clone()[:, :5]], True), + ([g.clone(), gnan.clone()[:, :5]], True), + ([ginf.clone(), g.clone()[:, :5]], True), + ([ginf.clone()[:, :5], g.clone()[:, :5]], True), + ) + + for grads, has_inf in cases: + found_inf.zero_() + torch._amp_foreach_non_finite_check_and_unscale_(grads, found_inf, inv_scale) + if has_inf: + self.assertEqual(found_inf, 1.0) + else: + self.assertEqual(found_inf, 0.0) + for grad in grads: + self.assertTrue(torch.allclose(grad, torch.ones_like(grad), atol=1e-7)) - found_inf = torch.tensor([0.0], dtype=dtype, device=device) - g = torch.tensor([4.0], dtype=dtype, device=device) - torch._amp_non_finite_check_and_unscale_(g, found_inf, inv_scale) - self.assertEqual(found_inf, 0.0) - self.assertTrue(torch.allclose(g, torch.ones(10, dtype=torch.float32, device="cuda"), atol=1e-7)) + # Passing lists with mismatched devices or dtypes to a raw + # _amp_foreach_non_finite_check_and_unscale_ call should raise errors. + with self.assertRaisesRegex(RuntimeError, r"must have the same dtype"): + torch._amp_foreach_non_finite_check_and_unscale_([g.clone(), g.to(dtype=torch.float16)], + found_inf, + inv_scale) - found_inf.zero_() - g = torch.tensor([float('inf')], dtype=dtype, device=device) - torch._amp_non_finite_check_and_unscale_(g, found_inf, inv_scale) - self.assertEqual(found_inf, 1.0) + if TEST_MULTIGPU: + with self.assertRaisesRegex(RuntimeError, r"scaled_grads must be on the same device."): + torch._amp_foreach_non_finite_check_and_unscale_([g.clone(), g.to(device="cuda:1")], + found_inf, + inv_scale) + + # Creates a list of grads with mismatched dtypes and devices, to ensure + # scaler._unscale_grads_ organizes grads by dtype and device before calling + # _amp_foreach_non_finite_check_and_unscale_ on each set. + # If inject_inf >= 0, writes an inf into one grad for _unscale_grads_ to find. 
+ def perfect_storm_grads(inject_inf): + grads = [g.clone(), g.clone()[:, :5], g.to(dtype=torch.float16), g.to(dtype=torch.float16)] + if TEST_MULTIGPU: + grads += [g.to(device="cuda:1"), + g.to(device="cuda:1")[:, :5], + g.to(device="cuda:1", dtype=torch.float16), + g.to(device="cuda:1", dtype=torch.float16)] + if inject_inf >= 0: + grads[inject_inf][2, 2] = float('inf') + return grads - found_inf.zero_() - g = torch.tensor([float('nan')], dtype=dtype, device=device) - torch._amp_non_finite_check_and_unscale_(g, found_inf, inv_scale) - self.assertEqual(found_inf, 1.0) + scaler = torch.cuda.amp.GradScaler() + dummy_params = [torch.empty_like(g) for g in perfect_storm_grads(-1)] + dummy_opt = torch.optim.SGD(dummy_params, lr=1.) + + # Ensures the inf/nan checking can find an inf injected onto any grad in the perfect storm. + for inject_inf in range(-1, len(dummy_params)): + found_inf = torch.full((1,), 0.0, dtype=torch.float, device="cuda:0") + grads = perfect_storm_grads(inject_inf) + for i, p in enumerate(dummy_params): + p.grad = grads[i] + found_inf_per_device = scaler._unscale_grads_(dummy_opt, inv_scale, found_inf, True) + if inject_inf < 0: + # No inf was injected, ensures unscaling worked normally. + self.assertTrue(sum(v.item() for v in found_inf_per_device.values()) == 0) + for grad in grads: + self.assertTrue(torch.allclose(grad, torch.ones_like(grad), atol=1e-7)) + else: + # inf was injected, ensures inf was found. + self.assertTrue(sum(v.item() for v in found_inf_per_device.values()) == 1) + def test_grad_scaling_update_scale(self, device="cuda", dtype=torch.float): growth = 2.0 backoff = 0.25 growth_interval = 2 - scale = torch.tensor([4.0], dtype=dtype, device=device) - growth_tracker = torch.tensor([0], dtype=torch.int32, device=device) + scale = torch.full((1,), 4.0, dtype=dtype, device=device) + growth_tracker = torch.full((1,), 0.0, dtype=torch.int32, device=device) + found_inf = torch.full((1,), 0.0, dtype=torch.float, device="cuda:0") - found_inf.zero_() # Simulates 2 consecutive unskipped iterations scale = torch._amp_update_scale(growth_tracker, scale, found_inf, growth, backoff, growth_interval) self.assertEqual(growth_tracker, 1) @@ -1779,7 +1864,7 @@ def test_grad_scaling_builtins(self, device="cuda", dtype=torch.float): def test_grad_scaling_unscale_sparse(self, device="cuda", dtype=torch.float): scaler = torch.cuda.amp.GradScaler() - inv_scale = torch.tensor([0.25], dtype=dtype, device=device) + inv_scale = torch.full((1,), 0.25, dtype=dtype, device=device) found_inf = torch.empty((1,), dtype=dtype, device=device) cur = found_inf.device @@ -1842,6 +1927,7 @@ def test_grad_scaling_device_as_key(self): # are treated as identical keys by dicts. GradScaler relies on this behavior, and may # error otherwise in a way that's difficult to detect (a silent performance hit). d = {} + t = torch.empty((1,), device="cuda:0") dev0a = torch.device("cuda:0") dev0b = torch.device("cuda:0") dev1a = torch.device("cuda:1") @@ -1854,6 +1940,9 @@ def test_grad_scaling_device_as_key(self): d[dev0b] = "0b" self.assertTrue(len(d) == 1) self.assertTrue(d[dev0a] == "0b") + d[t.device] = "t" + self.assertTrue(len(d) == 1) + self.assertTrue(d[dev0a] == "t") d[dev1a] = "1a" d[dev1b] = "1b" @@ -1863,8 +1952,8 @@ def test_grad_scaling_device_as_key(self): @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") def test_grad_scaling_scale(self): scaler = torch.cuda.amp.GradScaler(init_scale=2.) 
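# The "perfect storm" case above relies on scaler._unscale_grads_ bucketing gradients by
# (device, dtype) before each foreach kernel launch. A rough standalone sketch of that
# bucketing idea (group_by_device_and_dtype is an illustrative helper, not GradScaler's
# actual implementation):
from collections import defaultdict
import torch

def group_by_device_and_dtype(grads):
    buckets = defaultdict(list)
    for g in grads:
        buckets[(g.device, g.dtype)].append(g)
    return buckets

grads = [torch.ones(2, 2), torch.ones(2, 2, dtype=torch.float16), torch.ones(3)]
for (device, dtype), bucket in group_by_device_and_dtype(grads).items():
    # each homogeneous bucket could then be handed to
    # torch._amp_foreach_non_finite_check_and_unscale_ in a single call
    assert all(g.device == device and g.dtype == dtype for g in bucket)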
- t0 = torch.tensor([4.0], dtype=torch.float32, device="cuda:0") - t1 = torch.tensor([4.0], dtype=torch.float32, device="cuda:1") + t0 = torch.full((1,), 4.0, dtype=torch.float32, device="cuda:0") + t1 = torch.full((1,), 4.0, dtype=torch.float32, device="cuda:1") # Create some nested iterables of tensors on different devices. outputs = (t1.clone(), (t0.clone(), t1.clone()), [t0.clone(), (t1.clone(), t0.clone())]) outputs = scaler.scale(outputs) @@ -1882,7 +1971,7 @@ def test_grad_scaling_state_dict(self): if lazy_init_scale: # Dummy scale() call to ensure the scale tensor is lazily initialized. - s1.scale(torch.tensor([4.0], dtype=torch.float32, device="cuda:0")) + s1.scale(torch.full((1,), 4.0, dtype=torch.float32, device="cuda:0")) self.assertTrue(isinstance(s1._scale, torch.cuda.FloatTensor)) s1.load_state_dict(s0.state_dict()) @@ -2393,7 +2482,7 @@ def cast(val, to_type): "{} not found as an attribute on either Tensor or the requested module {}".format( op, module)) - # Accounts for ops that return tuples and other non-Tensors. + # Accounts for ops that return Tensors, iterables, and other non-Tensors. # For example, lstm_cell returns a tuple and equal returns bool. def compare(first, second): if isinstance(first, torch.Tensor): @@ -3047,6 +3136,48 @@ def test_matmul_device_mismatch(self): with self.assertRaisesRegex(RuntimeError, "expected (it|them) to be on GPU"): torch.addmm(s, m1, m2) + @unittest.skipIf(not TEST_MULTIGPU, "Test needs multiple GPUs") + def test_scatter_namedtuple(self): + # tests ability to scatter namedtuples and retrieve a list where each + # element is of the expected namedtuple type. + fields = ("a", "b") + TestNamedTupleInput_0 = collections.namedtuple("NamedTuple", fields) + num_gpus = torch.cuda.device_count() + a = torch.rand(num_gpus * 2, device=0) + b = torch.rand(num_gpus * 2, device=0) + a_tensors_for_gpu = [a[2 * i : 2 * i + 2].to(i) for i in range(num_gpus)] + b_tensors_for_gpu = [b[2 * i : 2 * i + 2].to(i) for i in range(num_gpus)] + + inp = TestNamedTupleInput_0(a, b) + target_gpus = [torch.device(i) for i in range(num_gpus)] + scatter_out = scatter_gather.scatter(inp, target_gpus) + + for i, x in enumerate(scatter_out): + self.assertTrue(isinstance(x, type(inp))) + self.assertEqual(x._fields, fields) + expected_a = a_tensors_for_gpu[i] + expected_b = b_tensors_for_gpu[i] + self.assertEqual(expected_a, x.a) + self.assertEqual(expected_b, x.b) + + class TestNamedTupleInput_1(NamedTuple): + a: torch.tensor + b: torch.tensor + + a = torch.rand(num_gpus * 2, device=0) + b = torch.rand(num_gpus * 2, device=0) + a_tensors_for_gpu = [a[2 * i : 2 * i + 2].to(i) for i in range(num_gpus)] + b_tensors_for_gpu = [b[2 * i : 2 * i + 2].to(i) for i in range(num_gpus)] + inp = TestNamedTupleInput_1(a, b) + + scatter_out = scatter_gather.scatter(inp, target_gpus) + for i, x in enumerate(scatter_out): + self.assertTrue(isinstance(x, type(inp))) + self.assertEqual(x._fields, fields) + expected_a = a_tensors_for_gpu[i] + expected_b = b_tensors_for_gpu[i] + self.assertEqual(expected_a, x.a) + self.assertEqual(expected_b, x.b) if __name__ == '__main__': run_tests() diff --git a/test/test_dataloader.py b/test/test_dataloader.py index ce23593ec7bc..9074cc3c0b7d 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -11,8 +11,10 @@ import itertools import warnings import tempfile +import random from torch import multiprocessing as mp -from torch.utils.data import _utils, Dataset, IterableDataset, TensorDataset, DataLoader, ConcatDataset, ChainDataset +from 
torch.utils.data import (_utils, Dataset, IterableDataset, TensorDataset, DataLoader, ConcatDataset, + ChainDataset, BufferedShuffleDataset) from torch.utils.data._utils import MP_STATUS_CHECK_INTERVAL from torch.utils.data.dataset import random_split from torch._utils import ExceptionWrapper @@ -710,6 +712,10 @@ def init_fn(worker_id): torch.manual_seed(12345) +def shuffle_ds_init_fn(worker_id): + random.seed(123) + + # used with test_error_in_init class ErrorIterableDataset(IterableDataset): def __iter__(self): @@ -1213,6 +1219,37 @@ def test_chain_iterable_style_dataset(self): with self.assertRaisesRegex(AssertionError, "ChainDataset only supports IterableDataset"): list(iter(ChainDataset([dataset1, self.dataset]))) + def test_buffer_shuffle_dataset(self): + dataset = CountingIterableDataset(20) + expected = list(range(20)) + buffer_sizes = [5, 20, 25] + for num_workers in [0, 1]: + # Buffer Size <= 1: Not shuffled dataset + fetched_nos = list(self._get_data_loader(BufferedShuffleDataset(dataset, 1), num_workers=num_workers)) + self.assertEqual(len(fetched_nos), len(expected)) + for e, d in zip(expected, fetched_nos): + self.assertIsInstance(d, torch.Tensor) + self.assertEqual(e, d) + # Buffer Size > 1: Shuffled dataset + for buffer_size in buffer_sizes: + fetched = sorted(list(self._get_data_loader(BufferedShuffleDataset(dataset, buffer_size), num_workers=num_workers))) + self.assertEqual(len(fetched), len(expected)) + for e, d in zip(expected, fetched): + self.assertIsInstance(d, torch.Tensor) + self.assertEqual(e, d) + # Random Seed for single process + random.seed(123) + fetched_seed1 = list(self._get_data_loader(BufferedShuffleDataset(dataset, buffer_size), num_workers=num_workers, + worker_init_fn=shuffle_ds_init_fn)) + random.seed(123) + fetched_seed2 = list(self._get_data_loader(BufferedShuffleDataset(dataset, buffer_size), num_workers=num_workers, + worker_init_fn=shuffle_ds_init_fn)) + self.assertEqual(len(fetched_seed1), len(fetched_seed2)) + for d1, d2 in zip(fetched_seed1, fetched_seed2): + self.assertIsInstance(d1, torch.Tensor) + self.assertIsInstance(d2, torch.Tensor) + self.assertEqual(d1, d2) + def test_multiprocessing_contexts(self): reference = [ torch.arange(3), diff --git a/test/test_dispatch.py b/test/test_dispatch.py index ec9fd20797e3..45480d8916f0 100644 --- a/test/test_dispatch.py +++ b/test/test_dispatch.py @@ -229,11 +229,11 @@ def test_def(self): # m.impl("test_def", [](const Tensor& x) { return x }) lambda m: m.impl_t_t("foo"), # m.impl("test_def", kCPU, [](const Tensor& x) { return x }) - lambda m: m.impl_t_t("foo", dispatch="cpu"), + lambda m: m.impl_t_t("foo", dispatch="CPU"), # m.impl("test_def", kAutograd, [](const Tensor& x) { return x }) - lambda m: m.impl_t_t("foo", dispatch="autograd"), + lambda m: m.impl_t_t("foo", dispatch="Autograd"), # m.impl("test_def", kAutogradCPU, [](const Tensor& x) { return x }) - lambda m: m.impl_t_t("foo", dispatch="autogradcpu") + lambda m: m.impl_t_t("foo", dispatch="AutogradCPU") ]).state self.assertExpectedInline(state, '''\ name: test::foo @@ -262,11 +262,11 @@ def test_def_with_inference(self): # m.def("foo", [](const Tensor & x) { return x }) lambda m: m.def_name_t_t("foo"), # m.impl("foo", torch::kCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "cpu"), + lambda m: m.impl_t_t("foo", "CPU"), # m.impl("foo", torch::kAutograd, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "autograd"), + lambda m: m.impl_t_t("foo", "Autograd"), # m.impl("foo", torch::kAutogradCPU, [](const 
Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "autogradcpu") + lambda m: m.impl_t_t("foo", "AutogradCPU") ]).state self.assertExpectedInline(state, '''\ name: test::foo @@ -296,11 +296,11 @@ def test_impl_only(self): # m.impl("foo", [](const Tensor& x) { return x }) lambda m: m.impl_t_t("foo"), # m.impl("foo", torch::kCPU, [](const Tensor& x) { return x }) - lambda m: m.impl_t_t("foo", "cpu"), + lambda m: m.impl_t_t("foo", "CPU"), # m.impl("foo", torch::kAutograd, [](const Tensor& x) { return x }) - lambda m: m.impl_t_t("foo", "autograd"), + lambda m: m.impl_t_t("foo", "Autograd"), # m.impl("foo", torch::kAutogradCPU, [](const Tensor& x) { return x }) - lambda m: m.impl_t_t("foo", "autogradcpu") + lambda m: m.impl_t_t("foo", "AutogradCPU") ]).state self.assertExpectedInline(state, '''\ name: test::foo @@ -316,13 +316,13 @@ def test_computed_table(self): # m.def("foo", [](const Tensor & x) { return x }) lambda m: m.def_name_t_t("foo"), # m.impl("foo", torch::kCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "cpu", debug="fn_cpu"), + lambda m: m.impl_t_t("foo", "CPU", debug="fn_cpu"), # m.impl("foo", torch::kCUDA, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "xla", debug="fn_xla"), + lambda m: m.impl_t_t("foo", "XLA", debug="fn_xla"), # m.impl("foo", torch::kAutograd, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "autograd", debug="fn_autograd"), + lambda m: m.impl_t_t("foo", "Autograd", debug="fn_autograd"), # m.impl("foo", torch::kAutogradCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "autogradcpu", debug="fn_autogradcpu") + lambda m: m.impl_t_t("foo", "AutogradCPU", debug="fn_autogradcpu") ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -351,12 +351,12 @@ def test_computed_table(self): ''') def test_computed_table_with_cpu_catchall(self): - global_m = C._dispatch_library("IMPL", "_", "autogradcpu") + global_m = C._dispatch_library("IMPL", "_", "AutogradCPU") result = self.commute("foo", [ # m.def("foo", [](const Tensor & x) { return x }) lambda m: m.def_name_t_t("foo"), # m.impl("foo", torch::kCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "cpu"), + lambda m: m.impl_t_t("foo", "CPU"), ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -382,12 +382,12 @@ def test_computed_table_with_cpu_catchall(self): ''') def test_computed_table_with_math(self): - global_m = C._dispatch_library("IMPL", "_", "autogradcpu") + global_m = C._dispatch_library("IMPL", "_", "AutogradCPU") result = self.commute("foo", [ # m.def("foo(Tensor x) -> Tensor") lambda m: m.def_("foo(Tensor x) -> Tensor"), # m.impl("foo", torch::kMath, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "math"), + lambda m: m.impl_t_t("foo", "Math"), ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -412,14 +412,14 @@ def test_computed_table_with_math(self): ''') def test_computed_table_with_cpu_math(self): - global_m = C._dispatch_library("IMPL", "_", "autogradcpu") + global_m = C._dispatch_library("IMPL", "_", "AutogradCPU") result = self.commute("foo", [ # m.def("foo(Tensor x) -> Tensor") lambda m: m.def_("foo(Tensor x) -> Tensor"), # m.impl("foo", torch::kCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "cpu", debug="fn_cpu"), + lambda m: m.impl_t_t("foo", "CPU", debug="fn_cpu"), # m.impl("foo", torch::kMath, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", 
"math", debug="fn_math"), + lambda m: m.impl_t_t("foo", "Math", debug="fn_math"), ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -445,12 +445,12 @@ def test_computed_table_with_cpu_math(self): ''') def test_computed_table_with_autograd(self): - global_m = C._dispatch_library("IMPL", "_", "autogradcpu") + global_m = C._dispatch_library("IMPL", "_", "AutogradCPU") result = self.commute("foo", [ # m.def("foo(Tensor x) -> Tensor") lambda m: m.def_("foo(Tensor x) -> Tensor"), # m.impl("foo", torch::kAutograd, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "autograd"), + lambda m: m.impl_t_t("foo", "Autograd"), ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -476,11 +476,11 @@ def test_computed_table_with_cpu_autograd_math_catchall(self): # m.def("foo", [](const Tensor & x) { return x }) lambda m: m.def_name_t_t("foo"), # m.impl("foo", torch::kCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "cpu", debug="fn_cpu"), + lambda m: m.impl_t_t("foo", "CPU", debug="fn_cpu"), # m.impl("foo", torch::kAutograd, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "autograd", debug="fn_autograd"), + lambda m: m.impl_t_t("foo", "Autograd", debug="fn_autograd"), # m.impl("foo", torch::kMath, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "math", debug="fn_math"), + lambda m: m.impl_t_t("foo", "Math", debug="fn_math"), ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -512,9 +512,9 @@ def test_computed_table_with_cpu_autograd_catchall(self): # m.def("foo", [](const Tensor & x) { return x }) lambda m: m.def_name_t_t("foo"), # m.impl("foo", torch::kCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "cpu", debug="fn_cpu"), + lambda m: m.impl_t_t("foo", "CPU", debug="fn_cpu"), # m.impl("foo", torch::kAutograd, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "autograd", debug="fn_autograd"), + lambda m: m.impl_t_t("foo", "Autograd", debug="fn_autograd"), ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -538,6 +538,39 @@ def test_computed_table_with_cpu_autograd_catchall(self): AutogradCPU: fn_autograd [autograd kernel] AutogradCUDA: fn_autograd [autograd kernel] AutogradXLA: fn_autograd [autograd kernel] +''') + + def test_computed_table_with_ambiguous_autogradother(self): + result = self.commute("foo", [ + # m.def("foo", [](const Tensor & x) { return x }) + lambda m: m.def_name_t_t("foo"), + # m.impl("foo", torch::kMath, [](const Tensor & x) { return x }) + lambda m: m.impl_t_t("foo", "Math", debug="fn_math"), + # m.impl("foo", torch::kQuantizedCPU, [](const Tensor & x) { return x }) + lambda m: m.impl_t_t("foo", "QuantizedCPU", debug="fn_quantizedcpu"), + ]) + state, table = result.state, result.table + self.assertExpectedInline(state, '''\ +name: test::foo +schema: test::foo(Tensor _0) -> (Tensor _0) +debug: registered at /dev/null:0 +alias analysis kind: CONSERVATIVE +QuantizedCPU: fn_quantizedcpu :: (Tensor _0) -> (Tensor _0) [ boxed unboxed ] +Math[alias]: fn_math :: (Tensor _0) -> (Tensor _0) [ boxed unboxed ] +catchall: default_def_name_t_t :: (Tensor _0) -> (Tensor _0) [ boxed unboxed ] +''') + + # computed dispatch table is too big, so we only check on a few entries we're interested in. 
+ extracted_table = extract_dispatch_table_with_keys(table, dispatch_keys_to_check) + + self.assertExpectedInline(extracted_table, '''\ +CPU: fn_math [math kernel] +CUDA: fn_math [math kernel] +XLA: fn_math [math kernel] +AutogradOther: ambiguous_autogradother [ambiguous autogradother] +AutogradCPU: fn_math [math kernel] +AutogradCUDA: fn_math [math kernel] +AutogradXLA: fn_math [math kernel] ''') # Can't do this yet for BC reasons @@ -631,7 +664,7 @@ def test_multiple_def_alias_mismatch(self): ) def test_multiple_fallback(self): - global_m = C._dispatch_library("IMPL", "_", "xla") + global_m = C._dispatch_library("IMPL", "_", "XLA") global_m.fallback_fallthrough(), try: global_m.fallback_fallthrough(), diff --git a/test/test_foreach.py b/test/test_foreach.py index 8369ba5b9be5..683b4fe28167 100644 --- a/test/test_foreach.py +++ b/test/test_foreach.py @@ -1,24 +1,47 @@ import torch import unittest -from torch.testing._internal.common_utils import TestCase, run_tests +from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_ROCM from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes, skipCUDAIfRocm class TestForeach(TestCase): - bin_ops = [ + foreach_bin_ops = [ torch._foreach_add, - torch._foreach_add_, torch._foreach_sub, - torch._foreach_sub_, torch._foreach_mul, - torch._foreach_mul_, torch._foreach_div, + ] + + foreach_bin_ops_ = [ + torch._foreach_add_, + torch._foreach_sub_, + torch._foreach_mul_, torch._foreach_div_, ] + foreach_bin_ops_sl = [ + torch._foreach_add_scalar_list, + torch._foreach_sub_scalar_list, + torch._foreach_mul_scalar_list, + torch._foreach_div_scalar_list, + ] + + foreach_bin_ops_sl_ = [ + torch._foreach_add_scalar_list_, + torch._foreach_sub_scalar_list_, + torch._foreach_mul_scalar_list_, + torch._foreach_div_scalar_list_, + ] + + torch_bin_ops = [ + torch.add, + torch.sub, + torch.mul, + torch.div, + ] + def _get_test_data(self, device, dtype, N): if dtype in [torch.bfloat16, torch.bool, torch.float16]: tensors = [torch.randn(N, N, device=device).to(dtype) for _ in range(N)] - elif dtype in torch.testing.get_all_int_dtypes(): tensors = [torch.randint(1, 100, (N, N), device=device, dtype=dtype) for _ in range(N)] else: @@ -26,50 +49,83 @@ def _get_test_data(self, device, dtype, N): return tensors - def _test_bin_op_list(self, device, dtype, foreach_op, foreach_op_, torch_op, N=20): - tensors1 = self._get_test_data(device, dtype, N) - tensors2 = self._get_test_data(device, dtype, N) - - expected = [torch_op(tensors1[i], tensors2[i]) for i in range(N)] - res = foreach_op(tensors1, tensors2) - foreach_op_(tensors1, tensors2) - self.assertEqual(res, tensors1) - self.assertEqual(tensors1, expected) - - def _test_unary_op(self, device, dtype, foreach_op, foreach_op_, torch_op, N=20): - tensors1 = self._get_test_data(device, dtype, N) - expected = [torch_op(tensors1[i]) for i in range(N)] - res = foreach_op(tensors1) - foreach_op_(tensors1) - self.assertEqual(res, tensors1) - self.assertEqual(tensors1, expected) - - def _test_pointwise_op(self, device, dtype, foreach_op, foreach_op_, torch_op, N=20): - tensors = self._get_test_data(device, dtype, N) - tensors1 = self._get_test_data(device, dtype, N) - tensors2 = self._get_test_data(device, dtype, N) - value = 2 - - expected = [torch_op(tensors[i], tensors1[i], tensors2[i], value=value) for i in range(N)] - - res = foreach_op(tensors, tensors1, tensors2, value) - foreach_op_(tensors, tensors1, tensors2, value) - self.assertEqual(res, tensors) - 
self.assertEqual(tensors, expected) - - def _test_bin_op_list_alpha(self, device, dtype, foreach_op, foreach_op_, torch_op, N=20): - tensors1 = self._get_test_data(device, dtype, N) - tensors2 = self._get_test_data(device, dtype, N) - alpha = 2 - - expected = [torch_op(tensors1[i], torch.mul(tensors2[i], alpha)) for i in range(N)] - res = foreach_op(tensors1, tensors2, alpha) - foreach_op_(tensors1, tensors2, alpha) - self.assertEqual(res, tensors1) - - if dtype == torch.bool: - expected = [e.to(torch.bool) for e in expected] - self.assertEqual(tensors1, expected) + def _test_bin_op_list(self, device, dtype, foreach_op, foreach_op_, torch_op): + for N in [30, 300]: + tensors1 = self._get_test_data(device, dtype, N) + tensors2 = self._get_test_data(device, dtype, N) + + # Mimics cuda kernel dtype flow. With fp16/bf16 input, runs in fp32 and casts output back to fp16/bf16. + control_dtype = torch.float32 if (self.device_type == 'cuda' and + (dtype is torch.float16 or dtype is torch.bfloat16)) else dtype + expected = [torch_op(tensors1[i].to(dtype=control_dtype), + tensors2[i].to(dtype=control_dtype)).to(dtype=dtype) for i in range(N)] + res = foreach_op(tensors1, tensors2) + foreach_op_(tensors1, tensors2) + self.assertEqual(res, tensors1) + if (dtype is torch.float16 or dtype is torch.bfloat16) and TEST_WITH_ROCM: + self.assertEqual(tensors1, expected, atol=1.e-3, rtol=self.dtype_precisions[dtype][0]) + else: + self.assertEqual(tensors1, expected) + + def _test_unary_op(self, device, dtype, foreach_op, foreach_op_, torch_op): + for N in [30, 300]: + tensors1 = self._get_test_data(device, dtype, N) + # Mimics cuda kernel dtype flow. With fp16/bf16 input, runs in fp32 and casts output back to fp16/bf16. + control_dtype = torch.float32 if (self.device_type == 'cuda' and + (dtype is torch.float16 or dtype is torch.bfloat16)) else dtype + expected = [torch_op(tensors1[i].to(dtype=control_dtype)).to(dtype=dtype) for i in range(N)] + res = foreach_op(tensors1) + foreach_op_(tensors1) + self.assertEqual(res, tensors1) + if (dtype is torch.float16 or dtype is torch.bfloat16) and TEST_WITH_ROCM: + self.assertEqual(tensors1, expected, atol=1.e-3, rtol=self.dtype_precisions[dtype][0]) + else: + self.assertEqual(tensors1, expected) + + def _test_pointwise_op(self, device, dtype, foreach_op, foreach_op_, torch_op): + for N in [30, 300]: + tensors = self._get_test_data(device, dtype, N) + tensors1 = self._get_test_data(device, dtype, N) + tensors2 = self._get_test_data(device, dtype, N) + value = 2 + + # Mimics cuda kernel dtype flow. With fp16/bf16 input, runs in fp32 and casts output back to fp16/bf16. 
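# The "control dtype" comments in these tests describe a common pattern: for half/bfloat16
# inputs on CUDA the foreach kernels accumulate in float32, so the reference value is
# computed at float32 and cast back before comparison. A condensed sketch (the device
# checks done by the real tests are omitted here):
import torch

t = torch.randn(4, 4).half()
control = torch.float32 if t.dtype in (torch.float16, torch.bfloat16) else t.dtype
expected = (t.to(control) + 3.3).to(t.dtype)  # reference computed at higher precision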
+ control_dtype = torch.float32 if (self.device_type == 'cuda' and + (dtype is torch.float16 or dtype is torch.bfloat16)) else dtype + expected = [torch_op(tensors[i].to(dtype=control_dtype), + tensors1[i].to(dtype=control_dtype), + tensors2[i].to(dtype=control_dtype), value=value).to(dtype=dtype) for i in range(N)] + + res = foreach_op(tensors, tensors1, tensors2, value) + foreach_op_(tensors, tensors1, tensors2, value) + self.assertEqual(res, tensors) + if (dtype is torch.float16 or dtype is torch.bfloat16) and TEST_WITH_ROCM: + self.assertEqual(tensors, expected, atol=1.e-3, rtol=self.dtype_precisions[dtype][0]) + else: + self.assertEqual(tensors, expected) + + def _test_bin_op_list_alpha(self, device, dtype, foreach_op, foreach_op_, torch_op): + for N in [30, 300]: + tensors1 = self._get_test_data(device, dtype, N) + tensors2 = self._get_test_data(device, dtype, N) + alpha = 2 + + # Mimics cuda kernel dtype flow. With fp16/bf16 input, runs in fp32 and casts output back to fp16/bf16. + control_dtype = torch.float32 if (self.device_type == 'cuda' and + (dtype is torch.float16 or dtype is torch.bfloat16)) else dtype + expected = [torch_op(tensors1[i].to(dtype=control_dtype), + torch.mul(tensors2[i].to(dtype=control_dtype), + alpha)).to(dtype=dtype) for i in range(N)] + res = foreach_op(tensors1, tensors2, alpha=alpha) + foreach_op_(tensors1, tensors2, alpha=alpha) + self.assertEqual(res, tensors1) + + if dtype == torch.bool: + expected = [e.to(torch.bool) for e in expected] + if (dtype is torch.float16 or dtype is torch.bfloat16) and TEST_WITH_ROCM: + self.assertEqual(tensors1, expected, atol=1.e-3, rtol=self.dtype_precisions[dtype][0]) + else: + self.assertEqual(tensors1, expected) # # Unary ops @@ -88,7 +144,7 @@ def test_exp(self, device, dtype): @skipCUDAIfRocm @dtypes(*torch.testing.get_all_dtypes(include_bfloat16=False, include_bool=False, include_complex=False)) def test_addcmul(self, device, dtype): - if device == 'cpu': + if self.device_type == 'cpu': if dtype == torch.half: with self.assertRaisesRegex(RuntimeError, r"\"addcmul_cpu_out\" not implemented for \'Half\'"): self._test_pointwise_op(device, dtype, torch._foreach_addcmul, @@ -105,7 +161,7 @@ def test_addcdiv(self, device, dtype): self._test_pointwise_op(device, dtype, torch._foreach_addcdiv, torch._foreach_addcdiv_, torch.addcdiv) return - if device == 'cpu': + if self.device_type == 'cpu': if dtype == torch.half: with self.assertRaisesRegex(RuntimeError, r"\"addcdiv_cpu_out\" not implemented for \'Half\'"): self._test_pointwise_op(device, dtype, torch._foreach_addcdiv, @@ -118,83 +174,398 @@ def test_addcdiv(self, device, dtype): # @dtypes(*torch.testing.get_all_dtypes()) def test_int_scalar(self, device, dtype): - tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)] - int_scalar = 1 - - # bool tensor + 1 will result in int64 tensor - if dtype == torch.bool: - expected = [torch.ones(10, 10, device=device, dtype=torch.int64) for _ in range(10)] - else: - expected = [torch.ones(10, 10, device=device, dtype=dtype) for _ in range(10)] - - res = torch._foreach_add(tensors, int_scalar) - self.assertEqual(res, expected) - - if dtype in [torch.bool]: - with self.assertRaisesRegex(RuntimeError, - "result type Long can't be cast to the desired output type Bool"): - torch._foreach_add_(tensors, int_scalar) - else: - torch._foreach_add_(tensors, int_scalar) - self.assertEqual(res, tensors) + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, + 
self.foreach_bin_ops_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalar = 3 + expected = [torch_bin_op(t, scalar) for t in tensors] + + res = foreach_bin_op(tensors, scalar) + + if dtype == torch.bool: + self.assertEqual(res, expected) + + with self.assertRaisesRegex(RuntimeError, "can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalar) + return + + + if foreach_bin_op_ == torch._foreach_div_ and dtype in torch.testing.integral_types() and self.device_type == "cpu": + with self.assertRaisesRegex(RuntimeError, + "can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalar) + return + + # TODO[type promotion]: Fix once type promotion is enabled. + if dtype in torch.testing.integral_types() and self.device_type == 'cuda': + self.assertEqual(res, [e.to(dtype) for e in expected]) + + foreach_bin_op_(tensors, scalar) + self.assertEqual(tensors, [e.to(dtype) for e in expected]) + else: + self.assertEqual(res, expected) + foreach_bin_op_(tensors, scalar) + self.assertEqual(tensors, expected) + + # TODO[Fix scalar list]: + # We need to update codegen to correctly handle function overloads with float[] and int[]. + # As optimizers work with float tensors, the result will always be torch.float32 for now. + # Current schema is using 'float[]' as scalar list type. + @dtypes(*torch.testing.get_all_dtypes()) + def test_int_scalarlist(self, device, dtype): + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops_sl, + self.foreach_bin_ops_sl_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalars = [1 for _ in range(N)] + expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] + + # we dont support bool and complex types on CUDA for now + if (dtype in torch.testing.get_all_complex_dtypes() or dtype == torch.bool) and self.device_type == 'cuda': + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op_(tensors, scalars) + + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op(tensors, scalars) + return + + res = foreach_bin_op(tensors, scalars) + + if dtype == torch.bool: + self.assertEqual(res, [torch_bin_op(t.to(torch.float32), s) for t, s in zip(tensors, scalars)]) + + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalars) + return + + if dtype in torch.testing.integral_types(): + if self.device_type == 'cpu': + self.assertEqual(res, [e.to(torch.float32) for e in expected]) + else: + # TODO[type promotion]: Fix once type promotion is enabled. + self.assertEqual(res, [e.to(dtype) for e in expected]) + else: + self.assertEqual(res, expected) + + if dtype in torch.testing.integral_types() and self.device_type == 'cpu': + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalars) + return + else: + foreach_bin_op_(tensors, scalars) + self.assertEqual(res, tensors) @dtypes(*torch.testing.get_all_dtypes()) def test_float_scalar(self, device, dtype): - tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)] - float_scalar = 1. 
- - # float scalar + integral tensor will result in float tensor - if dtype in [torch.uint8, torch.int8, torch.int16, - torch.int32, torch.int64, torch.bool]: - expected = [torch.ones(10, 10, device=device, dtype=torch.float32) for _ in range(10)] - else: - expected = [torch.ones(10, 10, device=device, dtype=dtype) for _ in range(10)] - - res = torch._foreach_add(tensors, float_scalar) - self.assertEqual(res, expected) + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, + self.foreach_bin_ops_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalar = 3.3 + + # Mimics cuda kernel dtype flow. With fp16/bf16 input, runs in fp32 and casts output back to fp16/bf16. + control_dtype = torch.float32 if (self.device_type == 'cuda' and + (dtype is torch.float16 or dtype is torch.bfloat16)) else dtype + expected = [torch_bin_op(t.to(dtype=control_dtype), + scalar) for t in tensors] + if (dtype is torch.float16 or dtype is torch.bfloat16): + expected = [e.to(dtype=dtype) for e in expected] + + if dtype == torch.bool: + if foreach_bin_op == torch._foreach_sub: + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): + foreach_bin_op_(tensors, scalar) + + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): + foreach_bin_op(tensors, scalar) + return + + res = foreach_bin_op(tensors, scalar) + if (dtype is torch.float16 or dtype is torch.bfloat16) and TEST_WITH_ROCM: + self.assertEqual(res, expected, atol=1.e-3, rtol=self.dtype_precisions[dtype][0]) + else: + self.assertEqual(res, expected) + + if dtype in torch.testing.integral_types(): + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalar) + return + + foreach_bin_op_(tensors, scalar) + if (dtype is torch.float16 or dtype is torch.bfloat16) and TEST_WITH_ROCM: + self.assertEqual(tensors, expected, atol=1.e-3, rtol=self.dtype_precisions[dtype][0]) + else: + self.assertEqual(tensors, expected) - if dtype in [torch.uint8, torch.int8, torch.int16, - torch.int32, torch.int64, torch.bool]: - self.assertRaises(RuntimeError, lambda: torch._foreach_add_(tensors, float_scalar)) - else: - torch._foreach_add_(tensors, float_scalar) - self.assertEqual(res, tensors) + @dtypes(*torch.testing.get_all_dtypes()) + def test_float_scalarlist(self, device, dtype): + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops_sl, + self.foreach_bin_ops_sl_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalars = [1.1 for _ in range(N)] + + # If incoming dtype is float16 or bfloat16, runs in float32 and casts output back to dtype. 
+ control_dtype = torch.float32 if (self.device_type == 'cuda' and + (dtype is torch.float16 or dtype is torch.bfloat16)) else dtype + expected = [torch_bin_op(t.to(dtype=control_dtype), + s) for t, s in zip(tensors, scalars)] + if (dtype is torch.float16 or dtype is torch.bfloat16): + expected = [e.to(dtype=dtype) for e in expected] + + # we dont support bool and complex types on CUDA for now + if (dtype in torch.testing.get_all_complex_dtypes() or dtype == torch.bool) and self.device_type == 'cuda': + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op_(tensors, scalars) + + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op(tensors, scalars) + return + + res = foreach_bin_op(tensors, scalars) + + if dtype == torch.bool: + # see TODO[Fix scalar list] + self.assertEqual(res, [torch_bin_op(t.to(torch.float32), s) for t, s in zip(tensors, scalars)]) + + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalars) + return + + if dtype in torch.testing.integral_types() and self.device_type == 'cuda': + # see TODO[Fix scalar list] + self.assertEqual(res, [e.to(dtype) for e in expected]) + + foreach_bin_op_(tensors, scalars) + self.assertEqual(tensors, res) + return + else: + if (dtype is torch.float16 or dtype is torch.bfloat16) and TEST_WITH_ROCM: + self.assertEqual(res, expected, atol=1.e-3, rtol=self.dtype_precisions[dtype][0]) + else: + self.assertEqual(res, expected) + + if dtype in torch.testing.integral_types() and self.device_type == "cpu": + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalars) + return + + foreach_bin_op_(tensors, scalars) + if (dtype is torch.float16 or dtype is torch.bfloat16) and TEST_WITH_ROCM: + self.assertEqual(tensors, expected, atol=1.e-3, rtol=self.dtype_precisions[dtype][0]) + else: + self.assertEqual(tensors, expected) @dtypes(*torch.testing.get_all_dtypes()) def test_complex_scalar(self, device, dtype): - tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)] - complex_scalar = 3 + 5j + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, + self.foreach_bin_ops_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalar = 3 + 5j + expected = [torch_bin_op(t, scalar) for t in tensors] + + if dtype == torch.bool: + if foreach_bin_op == torch._foreach_sub: + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): + foreach_bin_op_(tensors, scalar) + + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): + foreach_bin_op(tensors, scalar) + return + + if dtype in torch.testing.get_all_fp_dtypes(include_half=True, include_bfloat16=True) and \ + self.device_type == 'cuda': + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): + foreach_bin_op_(tensors, scalar) + + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): + foreach_bin_op(tensors, scalar) + return + + res = foreach_bin_op(tensors, scalar) + self.assertEqual(res, expected) + + if dtype not in [torch.complex64, torch.complex128]: + with self.assertRaisesRegex(RuntimeError, "can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalar) + else: + foreach_bin_op_(tensors, scalar) + self.assertEqual(res, tensors) - # bool tensor + 1 will result 
in int64 tensor - expected = [torch.add(complex_scalar, torch.zeros(10, 10, device=device, dtype=dtype)) for _ in range(10)] - - if dtype in [torch.float16, torch.float32, torch.float64, torch.bfloat16] and device == 'cuda:0': - # value cannot be converted to dtype without overflow: - self.assertRaises(RuntimeError, lambda: torch._foreach_add_(tensors, complex_scalar)) - self.assertRaises(RuntimeError, lambda: torch._foreach_add(tensors, complex_scalar)) - return - - res = torch._foreach_add(tensors, complex_scalar) - self.assertEqual(res, expected) - - if dtype not in [torch.complex64, torch.complex128]: - self.assertRaises(RuntimeError, lambda: torch._foreach_add_(tensors, complex_scalar)) - else: - torch._foreach_add_(tensors, complex_scalar) - self.assertEqual(res, tensors) + @dtypes(*torch.testing.get_all_dtypes()) + def test_complex_scalarlist(self, device, dtype): + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops_sl, + self.foreach_bin_ops_sl_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalars = [3 + 5j for _ in range(N)] + expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] + + if dtype == torch.bool: + if foreach_bin_op == torch._foreach_sub: + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): + foreach_bin_op_(tensors, scalar) + + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): + foreach_bin_op(tensors, scalar) + return + + with self.assertRaisesRegex(TypeError, "argument 'scalars' must be tuple of floats"): + res = foreach_bin_op(tensors, scalars) + + with self.assertRaisesRegex(TypeError, "argument 'scalars' must be tuple of floats"): + foreach_bin_op_(tensors, scalars) @dtypes(*torch.testing.get_all_dtypes()) def test_bool_scalar(self, device, dtype): - tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)] - bool_scalar = True - - expected = [torch.ones(10, 10, device=device, dtype=dtype) for _ in range(10)] - - res = torch._foreach_add(tensors, bool_scalar) - self.assertEqual(res, expected) + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, + self.foreach_bin_ops_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalar = True + + if dtype == torch.bool: + expected = [torch_bin_op(t, scalar) for t in tensors] + res = foreach_bin_op(tensors, scalar) + + foreach_bin_op_(tensors, scalar) + self.assertEqual(tensors, res) + return + + if foreach_bin_op == torch._foreach_sub and self.device_type == "cpu": + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator"): + res = foreach_bin_op(tensors, scalar) + + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator"): + foreach_bin_op_(tensors, scalar) + elif foreach_bin_op == torch._foreach_sub and self.device_type == 'cuda': + res = foreach_bin_op(tensors, scalar) + self.assertEqual(res, foreach_bin_op(tensors, 1)) + + foreach_bin_op_(tensors, scalar) + self.assertEqual(tensors, res) + else: + expected = [torch_bin_op(t, scalar) for t in tensors] + res = foreach_bin_op(tensors, scalar) + + # TODO[type promotion]: Fix once type promotion is enabled. 
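# The branches that follow hinge on standard type-promotion behavior: an out-of-place op on
# an integer tensor with a float scalar promotes to a floating result, while the in-place
# variant must keep the original dtype and therefore raises. A condensed illustration:
import torch

t = torch.ones(3, dtype=torch.int64)
print(torch.add(t, 3.3).dtype)  # torch.float32: promoted out-of-place result
try:
    t.add_(3.3)                 # cannot store a Float result in an int64 tensor in place
except RuntimeError as e:
    print("raised:", e)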
+ if dtype in torch.testing.integral_types() and self.device_type == 'cuda': + self.assertEqual(res, [e.to(dtype) for e in expected]) + else: + self.assertEqual(res, expected) + + if dtype in torch.testing.integral_types(): + if foreach_bin_op == torch._foreach_div and self.device_type == "cpu": + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired "): + foreach_bin_op_(tensors, scalar) + else: + foreach_bin_op_(tensors, scalar) + self.assertEqual(tensors, res) + else: + foreach_bin_op_(tensors, scalar) + self.assertEqual(tensors, expected) - torch._foreach_add_(tensors, bool_scalar) - self.assertEqual(res, tensors) + @dtypes(*torch.testing.get_all_dtypes()) + def test_bool_scalarlist(self, device, dtype): + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops_sl, + self.foreach_bin_ops_sl_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalars = [True for _ in range(N)] + + if dtype == torch.bool: + if self.device_type == 'cuda': + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op(tensors, scalars) + + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op_(tensors, scalars) + return + else: + if foreach_bin_op == torch._foreach_sub_scalar_list: + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with a bool tensor"): + foreach_bin_op_(tensors, scalars) + + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with a bool tensor"): + foreach_bin_op(tensors, scalars) + else: + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired"): + foreach_bin_op_(tensors, scalars) + + res = foreach_bin_op(tensors, scalars) + for r in res: + self.assertTrue(r.dtype == torch.float32) + else: + # we dont support bool and complex types on CUDA for now + if (dtype in torch.testing.get_all_complex_dtypes()) and self.device_type == 'cuda': + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op_(tensors, scalars) + + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op(tensors, scalars) + return + + if foreach_bin_op == torch._foreach_sub_scalar_list: + if self.device_type == "cpu": + # see TODO[Fix scalar list] + res = foreach_bin_op(tensors, scalars) + if dtype in torch.testing.integral_types(): + self.assertEqual(res, [r.to(torch.float32) for r in [torch_bin_op(t, 1) for t in tensors]]) + + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the "): + foreach_bin_op_(tensors, scalars) + else: + self.assertEqual(res, [torch_bin_op(t, 1) for t in tensors]) + foreach_bin_op_(tensors, scalars) + self.assertEqual(res, tensors) + else: + # see TODO[Fix scalar list] + res = foreach_bin_op(tensors, scalars) + if dtype in torch.testing.integral_types(): + self.assertEqual(res, [r.to(dtype) for r in [torch_bin_op(t, 1) for t in tensors]]) + else: + self.assertEqual(res, [torch_bin_op(t, 1) for t in tensors]) + + foreach_bin_op_(tensors, scalars) + self.assertEqual(res, tensors) + else: + if self.device_type == "cpu": + expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] + res = foreach_bin_op(tensors, scalars) + + # see TODO[Fix scalar list] + if dtype in torch.testing.integral_types(): + self.assertEqual(res, [e.to(torch.float32) for e in expected]) + else: + self.assertEqual(res, expected) + + if dtype in torch.testing.integral_types(): + with 
self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired "): + foreach_bin_op_(tensors, scalars) + else: + foreach_bin_op_(tensors, scalars) + self.assertEqual(tensors, expected) + else: + expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] + res = foreach_bin_op(tensors, scalars) + + if dtype in torch.testing.integral_types(): + self.assertEqual(res, [e.to(dtype) for e in expected]) + else: + self.assertEqual(res, expected) + + foreach_bin_op_(tensors, scalars) + self.assertEqual(res, tensors) @dtypes(*torch.testing.get_all_dtypes()) def test_add_with_different_size_tensors(self, device, dtype): @@ -318,13 +689,25 @@ def test_div_list(self, device, dtype): self.skipTest("Skipped! See https://github.com/pytorch/pytorch/issues/44489") return - self._test_bin_op_list(device, dtype, torch._foreach_div, torch._foreach_div_, torch.div) + for N in [30, 300]: + tensors1 = self._get_test_data(device, dtype, N) + + if dtype in [torch.bfloat16, torch.bool, torch.float16]: + tensors2 = [torch.zeros(N, N, device=device, dtype=dtype).add(2) for _ in range(N)] + else: + tensors2 = self._get_test_data(device, dtype, N) + + expected = [torch.div(tensors1[i], tensors2[i]) for i in range(N)] + res = torch._foreach_div(tensors1, tensors2) + torch._foreach_div_(tensors1, tensors2) + self.assertEqual(res, tensors1) + self.assertEqual(tensors1, res) def test_bin_op_list_error_cases(self, device): tensors1 = [] tensors2 = [] - for bin_op in self.bin_ops: + for bin_op in self.foreach_bin_ops + self.foreach_bin_ops_: # Empty lists with self.assertRaises(RuntimeError): bin_op(tensors1, tensors2) diff --git a/test/test_function_schema.py b/test/test_function_schema.py index f2ad2290d326..5a1527373478 100644 --- a/test/test_function_schema.py +++ b/test/test_function_schema.py @@ -14,90 +14,77 @@ def test_serialize_and_deserialize(self): self.assertEqual(parsed_schema, schema) self.assertTrue(parsed_schema.is_backward_compatible_with(schema)) - def test_backward_compatible_args(self): - old_schema = parse_schema('any(Tensor self, int dim) -> Tensor') - new_schema = parse_schema('any(Tensor self, int? dim) -> Tensor') + def test_backward_compatible_structure(self): + old_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor') + # BC: A new schema without changes. 
+ new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor') self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, int dim=5) -> Tensor') - self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> Tensor') - self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - - def test_backward_compatible_kwargs(self): - old_schema = parse_schema('any(Tensor self, *, Tensor out) -> Tensor') - new_schema = parse_schema('any(Tensor self, *, bool extra1=True, Tensor out, bool extra2=False) -> Tensor') - self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, Tensor out) -> Tensor') - self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - - def test_backward_compatible_ret(self): - old_schema = parse_schema('any(Tensor self) -> Tensor?') - new_schema = parse_schema('any(Tensor self) -> Tensor') - self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - - def test_backward_incompatible_name(self): - old_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> Tensor') - new_schema = parse_schema('any_(Tensor self, int dim, bool keepdim=False) -> Tensor') + # No-BC: A new schema with different name. + new_schema = parse_schema('any_.over(Tensor self, *, Tensor b) -> Tensor') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - - def test_backward_incompatible_vararg(self): - old_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> Tensor') - new_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False, ...) -> Tensor') + # No-BC: A new schema with different overload name. + new_schema = parse_schema('any.other(Tensor self, *, Tensor b) -> Tensor') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - - def test_backward_incompatible_returns(self): - old_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> Tensor') - new_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> (Tensor, ...)') + # No-BC: A new schema that adds vararg. + new_schema = parse_schema('any.over(Tensor self, *, Tensor b, ...) -> Tensor') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> int') + # No-BC: A new schema with different number of outputs. 
+ new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> (Tensor, Tensor)')
self.assertFalse(new_schema.is_backward_compatible_with(old_schema))
- self.assertFalse(old_schema.is_backward_compatible_with(new_schema))
- new_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> Tensor?')
+
+ def test_backward_compatible_outputs(self):
+ old_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor')
+ # No-BC: A new schema where the output becomes optional.
+ new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor?')
self.assertFalse(new_schema.is_backward_compatible_with(old_schema))
+ # BC: (the opposite case) A schema where the output is no longer of optional type.
self.assertTrue(old_schema.is_backward_compatible_with(new_schema))
- new_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> (Tensor, Tensor)')
+ # No-BC: A new schema with a different output type.
+ new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> int')
self.assertFalse(new_schema.is_backward_compatible_with(old_schema))
- self.assertFalse(old_schema.is_backward_compatible_with(new_schema))
- new_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> Tensor out')
+ # No-BC: A new schema where the output gains a name.
+ new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor out')
self.assertFalse(new_schema.is_backward_compatible_with(old_schema))
- self.assertFalse(old_schema.is_backward_compatible_with(new_schema))
- def test_backward_incompatible_args(self):
- old_schema = parse_schema('any(Tensor self, int[] dims, bool keepdim=False) -> Tensor')
- new_schema = parse_schema('any(Tensor s, int[] dims, bool keepdim=False) -> Tensor')
+ def test_backward_compatible_arguments(self):
+ old_schema = parse_schema('any(Tensor self, *, Tensor b, int c) -> Tensor')
+ # No-BC: A new schema with fewer arguments.
+ new_schema = parse_schema('any(Tensor self, *, Tensor b) -> Tensor')
self.assertFalse(new_schema.is_backward_compatible_with(old_schema))
- self.assertFalse(old_schema.is_backward_compatible_with(new_schema))
- new_schema = parse_schema('any(Tensor self, int[3] dims, bool keepdim=False) -> Tensor')
+ # No-BC: A new schema with more arguments, appended, but no default value.
+ new_schema = parse_schema('any(Tensor self, *, Tensor b, int c, int d) -> Tensor')
self.assertFalse(new_schema.is_backward_compatible_with(old_schema))
- self.assertFalse(old_schema.is_backward_compatible_with(new_schema))
- new_schema = parse_schema('any(Tensor self, int[](a) dims, bool keepdim=False) -> Tensor')
+ # BC: A new schema with more arguments, appended, that have a default value.
+ new_schema = parse_schema('any(Tensor self, *, Tensor b, int c, int d=1) -> Tensor')
+ self.assertTrue(new_schema.is_backward_compatible_with(old_schema))
+ # No-BC: A new schema with more arguments, not appended, that have a default value.
+ new_schema = parse_schema('any(Tensor self, int d=1, *, Tensor b, int c) -> Tensor')
self.assertFalse(new_schema.is_backward_compatible_with(old_schema))
+ # BC: A new schema where an old kwarg becomes positional.
+ new_schema = parse_schema('any(Tensor self, Tensor b, *, int c) -> Tensor')
+ self.assertTrue(new_schema.is_backward_compatible_with(old_schema))
+ # BC: (the opposite case) A new schema where an old positional argument becomes a kwarg.
self.assertFalse(old_schema.is_backward_compatible_with(new_schema))
- new_schema = parse_schema('any(Tensor self, int dims, bool keepdim=False) -> Tensor')
- self.assertFalse(new_schema.is_backward_compatible_with(old_schema))
+ # BC: A new schema where all old kwargs become positional.
+ new_schema = parse_schema('any(Tensor self, Tensor b, int c) -> Tensor')
+ self.assertTrue(new_schema.is_backward_compatible_with(old_schema))
+ # BC: (the opposite case) A new schema where all old positional arguments become kwargs.
self.assertFalse(old_schema.is_backward_compatible_with(new_schema))
- new_schema = parse_schema('any(Tensor self, int[] dim, bool keepdim=False, bool? extra=None) -> Tensor')
+ # No-BC: A new schema where old kwargs appear in a different order.
+ new_schema = parse_schema('any(Tensor self, *, int c, Tensor b) -> Tensor')
self.assertFalse(new_schema.is_backward_compatible_with(old_schema))
- self.assertFalse(old_schema.is_backward_compatible_with(new_schema))
-
- def test_backward_incompatible_kwargs(self):
- old_schema = parse_schema('any(Tensor self, int[] dims, *, bool keepdim=False) -> Tensor')
- new_schema = parse_schema('any(Tensor self, int[] dims, *, bool keepdim) -> Tensor')
+ # BC: A new schema where an argument becomes optional.
+ new_schema = parse_schema('any(Tensor self, *, Tensor b, int? c) -> Tensor')
+ self.assertTrue(new_schema.is_backward_compatible_with(old_schema))
+ # BC: A new schema where an argument gains a default value.
+ new_schema = parse_schema('any(Tensor self, *, Tensor b, int c=1) -> Tensor')
+ self.assertTrue(new_schema.is_backward_compatible_with(old_schema))
+ # No-BC: A new schema where an argument is "renamed".
+ new_schema = parse_schema('any(Tensor self, *, Tensor b, int renamed) -> Tensor')
self.assertFalse(new_schema.is_backward_compatible_with(old_schema))
- self.assertTrue(old_schema.is_backward_compatible_with(new_schema))
- new_schema = parse_schema('any(Tensor self, int[] dims, *, bool keepdim=False, bool extra) -> Tensor')
+ # No-BC: A new schema where an argument's type changes to an incompatible type.
+ new_schema = parse_schema('any(Tensor self, *, Tensor b, int[] c) -> Tensor') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - if __name__ == '__main__': run_tests() diff --git a/test/test_fx.py b/test/test_fx.py index 41607d64cbcc..1451c5efe5cb 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -6,6 +6,10 @@ import copy from pathlib import Path from torch.fx import symbolic_trace, Proxy, Node, GraphModule, Tracer, Graph +from torch.fx.experimental import GraphManipulation +from torch.fx.experimental import shape_prop +from torch.fx.experimental.Partitioner import DAG, Partitioner +from torch.fx.experimental.subgraph_creation_example import split_module from torch.fx.proxy import TraceError @@ -26,6 +30,9 @@ class SimpleTest(torch.nn.Module): def forward(self, x): return torch.relu(x + 3.0) +def a_non_torch_leaf(a, b): + return a + b + class TestFX(JitTestCase): def checkGraphModule(self, m: torch.nn.Module, args, kwargs=None): """Check that an nn.Module's results match the GraphModule version @@ -34,6 +41,7 @@ def checkGraphModule(self, m: torch.nn.Module, args, kwargs=None): kwargs = kwargs if kwargs else {} ref_outs = m(*args, **kwargs) gm = symbolic_trace(m) + gm.graph.lint(gm) test_outs = gm(*args, **kwargs) self.assertEqual(ref_outs, test_outs) @@ -79,6 +87,17 @@ def forward(self, A, b=4, *args, c=5, **kwargs): t = T() symbolic_trace(t) + def test_custom_import(self): + graph = torch.fx.Graph() + a = graph.placeholder('x') + b = graph.placeholder('y') + c = graph.call_function(a_non_torch_leaf, (a, b)) + d = graph.call_function(torch.sin, (c,)) + graph.output(d) + gm = GraphModule(torch.nn.Module(), graph) + x, y = torch.rand(1), torch.rand(1) + self.assertEqual(torch.sin(x + y), gm(x, y)) + def test_args_kwargs(self): class T(torch.nn.Module): def forward(self, *args, **kwargs): @@ -165,8 +184,9 @@ def forward(self, x): mrm = MyReluMod() sym = NoLeafModulesTracer().trace(mrm) - for node in sym.graph.nodes: + for node in sym.nodes: self.assertNotEqual(node.op, 'call_module') + sym.lint(sym) def test_graph_edit_with_proxy(self): class M(torch.nn.Module): @@ -174,12 +194,49 @@ def forward(self, a, b): return a + b m = M() g = symbolic_trace(m).graph - t = Proxy(g.result) + new_g = torch.fx.Graph() + val_map : Dict[Node, Node] = {} + output_val = new_g.graph_copy(g, val_map) + t = Proxy(output_val) # test that we can use proxy objects to generate more graph code later for things that do not need to work with modules. - g.output((t + t).node) - gm = GraphModule(m, g) + new_g.output((t + t).node) + gm = GraphModule(m, new_g) + gm.graph.lint(gm) self.assertEqual(gm(3, 4), 14) + def test_graph_unique_names(self): + class M(torch.nn.Module): + def forward(self, a, b): + return a + b + m = M() + g = symbolic_trace(m).graph + new_g = torch.fx.Graph() + val_map : Dict[Node, Node] = {} + output_val = new_g.graph_copy(g, val_map) + t = Proxy(output_val) + # test that we can use proxy objects to generate more graph code later for things that do not need to work with modules. 
+ new_g.output((t + t).node) + gm = GraphModule(m, new_g) + seen_names : Set[str] = set() + for node in gm.graph.nodes: + assert node.name not in seen_names + seen_names.add(node.name) + + def test_graph_unique_names_manual(self): + graph : torch.fx.Graph = torch.fx.Graph() + a : torch.fx.Node = graph.create_node('placeholder', 'x') + b : torch.fx.Node = graph.create_node('call_module', 'linear_mod', args=(a,), name='foo_1_1') + c : torch.fx.Node = graph.create_node('get_attr', 'y_attr', name='foo_1') + d : torch.fx.Node = graph.create_node('call_function', operator.add, args=(b, c)) + graph.output(d) + graph2 = torch.fx.Graph() + val_map : Dict[Node, Node] = {} + graph2.graph_copy(graph, val_map) + seen_names : Set[str] = set() + for node in graph2.nodes: + assert node.name not in seen_names + seen_names.add(node.name) + @skipIfNoTorchVision def test_resnet(self): resnet = resnet18() @@ -202,6 +259,7 @@ def test_resnet(self): quantizer.observe((torch.rand(1, 3, 224, 224),)) qgraph = quantizer.quantize() + qgraph.graph.lint(qgraph) qgraph_script = torch.jit.script(qgraph) d = qgraph(ip) @@ -271,6 +329,7 @@ def lower_to_elementwise_interpreter(orig_mod : torch.nn.Module) -> torch.nn.Mod operator.mul : "mul" } + output_node : Optional[Node] = None # For each instruction, create a triple # (instruction_name : str, inputs : List[str], output : str) # to feed into the C++ interpreter @@ -297,9 +356,12 @@ def lower_to_elementwise_interpreter(orig_mod : torch.nn.Module) -> torch.nn.Mod else: arg_names.append(arg.name) instructions.append((target_to_name[target], arg_names, out_name)) - + elif n.op == 'output': + if output_node is not None: + raise RuntimeError('Multiple output nodes!') + output_node = n else: - raise RuntimeError('Unsupported opcode' + n.op) + raise RuntimeError('Unsupported opcode ' + n.op) interpreter = torch.classes._TorchScriptTesting._ElementwiseInterpreter() # Load constants @@ -310,7 +372,8 @@ def lower_to_elementwise_interpreter(orig_mod : torch.nn.Module) -> torch.nn.Mod # Load instructions interpreter.set_instructions(instructions) # Specify name for single output - interpreter.set_output_name(mod.graph.result.name) + assert isinstance(output_node.args[0], torch.fx.Node) + interpreter.set_output_name(output_node.args[0].name) # ===== Stage 3: Create a wrapper GraphModule around the interpreter ===== class WrapperModule(torch.nn.Module): @@ -345,6 +408,8 @@ def __init__(self, interpreter): # Register output graph.output(output_node) + graph.lint(wrapper) + # Return final GraphModule!!! 
return GraphModule(wrapper, graph) @@ -376,6 +441,7 @@ def forward(self, a): m = M() m_g = symbolic_trace(m) + m_g.graph.lint(m_g) for node in m_g.graph.nodes: self.assertTrue(node.name != "getattr") @@ -392,7 +458,8 @@ def forward(self, a, b): return a + b m = M() - g = TaggingTracer().trace(m).graph + g = TaggingTracer().trace(m) + g.lint(m) for n in g.nodes: self.assertTrue(hasattr(n, 'tag')) self.assertEqual(n.tag, 'foo') @@ -420,6 +487,7 @@ def forward(self, x): wfq = WrapperForQualname() traced2 = symbolic_trace(wfq) + traced2.graph.lint(traced2) traced2(torch.rand(4, 4)) def test_symbolic_trace_sequential(self): @@ -433,6 +501,7 @@ def forward(self, x): Simple() ) traced = symbolic_trace(seq) + traced.graph.lint(traced) x = torch.rand(3, 4) self.assertEqual(traced(x), seq(x)) @@ -443,6 +512,7 @@ def forward(self, x): ct = ConstTensor() traced = symbolic_trace(ct) + traced.graph.lint(traced) traced(torch.rand(4, 4)) def test_pickle_graphmodule(self): @@ -456,22 +526,28 @@ def forward(self, x): n = Nested() traced = symbolic_trace(n) + traced.graph.lint(traced) pickled = pickle.dumps(traced) loaded = pickle.loads(pickled) + loaded.graph.lint(loaded) x = torch.rand(3, 4) self.assertEqual(loaded(x), traced(x)) def test_deepcopy_graphmodule_with_transform(self): st = SimpleTest() traced = symbolic_trace(st) + traced.graph.lint(traced) def transform(traced): - new_graph = copy.deepcopy(traced.graph) + new_graph = torch.fx.Graph() + val_map : Dict[Node, Node] = {} + output_value = new_graph.graph_copy(traced.graph, val_map) relu_out = new_graph.create_node( - op='call_method', target='neg', args=(new_graph.result,), kwargs={}) + op='call_method', target='neg', args=(output_value,), kwargs={}) new_graph.output(relu_out) return GraphModule(traced, new_graph) transformed = transform(traced) + transformed.graph.lint(transformed) copied = copy.deepcopy(transformed) self.assertNotEqual(id(type(transformed)), id(type(copied))) x = torch.randn(3, 4) @@ -497,7 +573,9 @@ def forward(self, x): baz = Baz() traced = symbolic_trace(baz) + traced.graph.lint(traced) copied = copy.deepcopy(traced) + copied.graph.lint(copied) def test_unpack_list_better_error(self): class SomeArgs(torch.nn.Module): @@ -543,12 +621,32 @@ def forward(self, a): input = torch.randn(3) ref_out = m(input) gm = symbolic_trace(m) + gm.graph.lint(gm) out = gm(input) self.assertEqual(out, ref_out) + def test_replace_target_nodes_with(self): + class testModule(torch.nn.Module): + def forward(self, a, b): + return a + b + m = testModule() + traced = symbolic_trace(m) + input1 = torch.randn(1) + input2 = torch.randn(1) + assert (input1 + input2) == traced(input1, input2) + GraphManipulation.replace_target_nodes_with( + fx_module=traced, + old_op="call_function", + old_target=operator.add, + new_op="call_function", + new_target=operator.mul, + ) + assert (input1 * input2) == traced(input1, input2) + def test_pretty_print(self): st = SimpleTest() traced = symbolic_trace(st) + traced.graph.lint(traced) printed = str(traced) assert 'GraphModuleImpl()' in printed assert 'torch.relu' in printed @@ -559,6 +657,7 @@ def forward(self, x): return torch.squeeze(x + 3.0, dim=2) st = KwargPrintTest() traced = symbolic_trace(st) + traced.graph.lint(traced) stringed = str(traced.graph) for s in ['args', 'kwargs', 'uses']: assert s in stringed @@ -575,6 +674,7 @@ def test_graph_fns(self): mod.linear = torch.nn.Linear(3, 4) mod.bias = torch.rand(4) gm = GraphModule(mod, g) + gm.graph.lint(gm) input = torch.rand(3) r = gm(input) ref = 
torch.sin(mod.linear(input) + mod.bias) @@ -592,6 +692,7 @@ def test_construct_root_dict(self): add_param : torch.Tensor = torch.rand(3, 4) gm : torch.fx.GraphModule = torch.fx.GraphModule( {'foo.bar.baz': linear_mod, 'zip.zap.zam' : add_param}, graph) + gm.graph.lint(gm) assert 'self.foo.bar.baz' in gm.code @@ -600,6 +701,252 @@ def test_construct_root_dict(self): ref_out : torch.Tensor = linear_mod(x) + add_param self.assertEqual(out, ref_out) + def test_symbolic_trace_assert(self): + message = "assert_foobar" + + class AssertsTensorShape(torch.nn.Module): + def forward(self, x): + torch.Assert(x.shape[1] > 4, message) + return x + + m = AssertsTensorShape() + # verify traceability + traced = symbolic_trace(m) + # verify assertion on traced model works correctly at runtime + traced(torch.rand(4, 5)) + with self.assertRaisesRegex(AssertionError, message): + traced(torch.rand(4, 3)) + + def test_get_all_users_of(self): + graph : torch.fx.Graph = torch.fx.Graph() + a : torch.fx.Node = graph.create_node('placeholder', 'x') + b : torch.fx.Node = graph.create_node('call_module', 'linear_mod', args=(a,)) + c : torch.fx.Node = graph.create_node('get_attr', 'y_attr') + d : torch.fx.Node = graph.create_node('call_function', operator.add, args=(b, c)) + graph.output(d) + linear_mod : torch.nn.Module = torch.nn.Linear(3, 4) + add_param : torch.Tensor = torch.rand(3, 4) + gm : torch.fx.GraphModule = torch.fx.GraphModule( + {'linear_mod': linear_mod, 'y_attr' : add_param}, graph) + expected_uses: Dict[int, List[int]] = { + 0: [1], + 1: [3], + 2: [3], + 3: [4], + 4: [], + } + for i, node in enumerate(graph.nodes): + user_indexes = GraphManipulation.get_all_users_of(gm, i) + assert user_indexes == expected_uses[i] + + def test_copy_no_remap(self): + traced = symbolic_trace(SimpleTest()) + g = traced.graph + copied = torch.fx.Graph() + for node in g.nodes: + copied.node_copy(node) + with self.assertRaisesRegex(RuntimeError, 'does not belong to this Graph'): + copied.lint() + + def test_wrong_topo(self): + graph : torch.fx.Graph = torch.fx.Graph() + a : torch.fx.Node = graph.create_node('placeholder', 'x') + b : torch.fx.Node = graph.create_node('call_module', 'foo.bar.baz', args=(a,)) + c : torch.fx.Node = graph.create_node('get_attr', 'zip.zap.zam') + d : torch.fx.Node = graph.create_node('call_function', operator.add, args=(b, c)) + graph.output(d) + nodes = graph._nodes + nodes[2], nodes[3] = nodes[3], nodes[2] + with self.assertRaisesRegex(RuntimeError, 'was used before it has been defined'): + graph.lint() + + def test_example_shape_prop(self): + class TestCase(torch.nn.Module): + def __init__(self): + super().__init__() + self.attr = torch.randn(3, 4) + self.submod = torch.nn.Linear(4, 4) + + def forward(self, x): + return torch.neg(self.submod(x.relu() + self.attr)) + tc = TestCase() + tc_traced = symbolic_trace(tc) + ref_out = tc_traced(torch.rand(3, 4)) + shape_prop.ShapeProp(tc_traced).propagate(torch.rand(3, 4)) + + # Make sure we're testing all opcodes + opcodes = set() + output_shape : Optional[torch.Shape] = None + for node in tc_traced.graph.nodes: + opcodes.add(node.op) + if node.op == 'output': + output_shape = node.args[0].shape + self.assertEqual(opcodes, set(['placeholder', 'get_attr', 'call_function', 'call_method', + 'call_module', 'output'])) + + # Test shape propogation and make sure results match actual + self.assertEqual(output_shape, ref_out.shape) + + def test_find_single_partition(self): + class testModule(torch.nn.Module): + def forward(self, a, b): + return a + b + m = 
testModule() + traced = symbolic_trace(m) + partitioner = Partitioner() + devices = [{"name": "dev_0", "available_mem": float('inf')}] + dag = partitioner.partition_graph(traced, devices) + for node in traced.graph.nodes: + assert node.op == 'output' or node.partition_ids == [1] + nodes = traced.graph.nodes + res_dag = DAG() + res_dag.create_node(0, [], [1], [], []) + res_dag.create_node(1, [0], [], [nodes[0], nodes[1]], [nodes[2]]) + for r, d in zip(res_dag.nodes, dag.nodes): + assert(r.partition_id == d.partition_id) + assert(r.parents == d.parents) + assert(r.children == d.children) + assert(r.input_nodes == d.input_nodes) + assert(r.output_nodes == d.output_nodes) + + def test_subgraph_creation(self): + class MyModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.param = torch.nn.Parameter(torch.rand(3, 4)) + self.linear = torch.nn.Linear(4, 5) + + def forward(self, x, y): + z = self.linear(x + self.param).clamp(min=0.0, max=1.0) + w = self.linear(y).clamp(min=0.0, max=1.0) + return z + w + + # symbolically trace model + my_module = MyModule() + my_module_traced = symbolic_trace(my_module) + + # random mod partitioning + partition_counter = 0 + NPARTITIONS = 3 + + def mod_partition(node: Node): + nonlocal partition_counter + partition = partition_counter % NPARTITIONS + partition_counter = (partition_counter + 1) % NPARTITIONS + return partition + + # split module in module with submodules + module_with_submodules = split_module(my_module_traced, my_module, mod_partition) + + x = torch.rand(3, 4) + y = torch.rand(3, 4) + + orig_out = my_module_traced(x, y) + submodules_out = module_with_submodules(x, y) + + self.assertEqual(orig_out, submodules_out) + + @skipIfNoTorchVision + def test_replace_uses(self): + rn18 = resnet18() + + class LowerReluTracer(torch.fx.Tracer): + def is_leaf_module(self, m : torch.nn.Module, qualname : str): + if isinstance(m, torch.nn.ReLU): + return False + return super().is_leaf_module(m, qualname) + + rn18_traced = GraphModule(rn18, LowerReluTracer().trace(rn18)) + + to_erase = [] + for node in rn18_traced.graph.nodes: + if node.op == 'call_function' and node.target in [torch.relu, torch.nn.functional.relu]: + kwargs = node.kwargs + # Neg doesn't have in-place + kwargs.pop('inplace') + with torch.fx.graph.insert_before(node): + new_node = rn18_traced.graph.call_function( + the_function=torch.neg, args=node.args, kwargs=node.kwargs) + node.replace_all_uses_with(replace_with=new_node) + to_erase.append(node) + + for node in to_erase: + rn18_traced.graph.erase_node(node) + + def test_insertion_point(self): + graph : torch.fx.Graph = torch.fx.Graph() + x : torch.fx.Node = graph.create_node('placeholder', 'x') + b : torch.fx.Node = graph.create_node('call_function', target=torch.relu, args=(x,)) + output : torch.fx.Node = graph.output(b) + + with torch.fx.graph.insert_before(b): + neg : torch.fx.Node = graph.call_function(the_function=torch.neg, args=(x,)) + _, *relu_args = b.args + b.args = (neg, *relu_args) + + gm = torch.fx.GraphModule(torch.nn.Module(), graph) + + input = torch.randn(33, 44) + self.assertEqual(gm(input), torch.relu(torch.neg(input))) + + + def test_move_before(self): + graph : torch.fx.Graph = torch.fx.Graph() + x : torch.fx.Node = graph.create_node('placeholder', 'x') + b : torch.fx.Node = graph.create_node('call_function', target=torch.relu, args=(x,)) + output : torch.fx.Node = graph.output(b) + + neg : torch.fx.Node = graph.call_function(the_function=torch.neg, args=(x,)) + _, *relu_args = b.args + b.args = (neg, 
*relu_args) + graph.move_node_before(to_move=neg, before=b) + + gm = torch.fx.GraphModule(torch.nn.Module(), graph) + + input = torch.randn(33, 44) + self.assertEqual(gm(input), torch.relu(torch.neg(input))) + + def test_erase_node_error(self): + st = SimpleTest() + traced = symbolic_trace(st) + + for node in traced.graph.nodes: + # Test deleting with uses both in another Node and at the output + if node.target in [operator.add, torch.relu]: + with self.assertRaisesRegex(RuntimeError, 'but it still had .* uses in the graph!'): + traced.graph.erase_node(node) + + def test_find_uses(self): + graph = torch.fx.Graph() + x = torch.fx.Proxy(graph.placeholder('x')) + + y = torch.relu(x) + z = x + x + u = torch.neg(x) + graph.output((y + z + u).node) + graph.lint() + + uses_of_x = x.node.find_uses() + self.assertEqual(len(uses_of_x), 3) + expected_ops = ['relu', 'add', 'neg'] + for node, expected in zip(uses_of_x, expected_ops): + assert expected in node.name + + def test_multi_insert_point(self): + graph = torch.fx.Graph() + x = torch.fx.Proxy(graph.placeholder('x')) + relu = torch.relu(x) + + with torch.fx.graph.insert_before(relu.node): + y = torch.neg(x) + z = torch.tanh(y) + + graph.output((relu.node, z.node)) + graph.lint() + + expected_ops = ['x', 'neg', 'tanh', 'relu'] + for node, expected in zip(graph.nodes, expected_ops): + assert expected in node.name if __name__ == '__main__': run_tests() diff --git a/test/test_jit.py b/test/test_jit.py index b689f76681f7..d093a4b8826e 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -32,6 +32,7 @@ from jit.test_enum import TestEnum # noqa: F401 from jit.test_profiler import TestProfiler # noqa: F401 from jit.test_slice import TestSlice # noqa: F401 +from jit.test_warn import TestWarn # noqa: F401 # Torch from torch import Tensor @@ -1425,7 +1426,7 @@ def test_dropout(self): self.assertEqual(outputs, m(*inputs)) @slowTest - @unittest.skipIf(GRAPH_EXECUTOR == ProfilingMode.SIMPLE, 'Testing differentiable graph') + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.LEGACY, 'Testing differentiable graph') def test_dropout_module_requires_grad(self): with enable_profiling_mode_for_profiling_tests(): class MyModule(torch.nn.Module): @@ -7205,6 +7206,20 @@ def f(x): x = torch.rand(3, 4) self.assertEqual(scripted_f(x), f(x)) + def test_multiline_string_dedents(self): + def foo() -> None: + multiline_string_dedent_1 = """ +This is a string dedent """ + multiline_string_dedent_2 = """ This is a + string dedent """ + multiline_string_dedent_3 = """ + This is a string +dedent """ + multiline_string_dedent_4 = """ This is a string dedent """ + + scripted_foo = torch.jit.script(foo) + self.assertEqual(scripted_foo(), foo()) + # adapted from test in test_torch def test_tensor_to(self): template = dedent(''' @@ -9995,6 +10010,21 @@ def method(self, x): with self.assertRaisesRegex(RuntimeError, "Argument y not provided."): ModuleDefault() + def test_type_inferred_from_empty_annotation(self): + """ + Test that the type inferred from an empty or missing annotation is Torch.Tensor wtih `inferred=true` + """ + @torch.jit.script + def fn(x): + return x + + graph = fn.graph + n = next(graph.inputs()) + self.assertTrue(n.type() == torch._C.TensorType.getInferred()) + + with self.assertRaisesRegex(RuntimeError, "Inferred \'x\' to be of type \'Tensor"): + fn(1) + def test_script_define_order(self): class M(torch.jit.ScriptModule): diff --git a/test/test_jit_cuda_fuser.py b/test/test_jit_cuda_fuser.py index 9d61cd5dd157..ac9f054d38c8 100644 --- 
a/test/test_jit_cuda_fuser.py +++ b/test/test_jit_cuda_fuser.py @@ -3,7 +3,7 @@ import torch -from torch.testing._internal.common_utils import run_tests, ProfilingMode, GRAPH_EXECUTOR, skipIfRocm +from torch.testing._internal.common_utils import run_tests, ProfilingMode, GRAPH_EXECUTOR, skipIfRocm, TEST_WITH_ROCM from torch.testing._internal.codegen.random_topo_test import runDefaultTestWithSeed from test_jit import JitTestCase, RUN_CUDA @@ -550,9 +550,8 @@ def t(x: torch.Tensor, y: torch.Tensor): jit_o = t_jit(x, y) jit_o = t_jit(x, y) o = t(x, y) - for oo, jit_oo in zip(o, jit_o): - self.assertEqual(oo.dtype, jit_oo.dtype) - self.assertEqual(oo, jit_oo) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertEqual(o, jit_o) self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GROUP) # end-2-end test of permutation & contiguity handling in integration. @@ -595,11 +594,10 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): jit_o = t_jit(x, y) jit_o = t_jit(x, y) o = t(x, y) - for oo, jit_oo in zip(o, jit_o): - self.assertEqual(oo.dtype, jit_oo.dtype) - # numerical issues here due to our scheduling. - # can't use `self.assertEqual(oo, jit_oo)` - self.assertTrue(self._compare("comparing output failed", oo, jit_oo, 1e-4)) + self.assertEqual(o.dtype, jit_o.dtype) + # numerical issues here due to our scheduling. + # can't use `self.assertEqual(o, jit_o)` + self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4)) self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GROUP) @unittest.skipIf(not RUN_CUDA, "requires CUDA") @@ -630,6 +628,81 @@ def test_reduction_permutation(self): for perm1 in itertools.permutations(range(len(x))): self._reduction_helper(x, axes, torch.float32, "cuda", perm0, perm1) + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING and GRAPH_EXECUTOR != + ProfilingMode.LEGACY, "Requires fusion optimization pass to be effective") + def test_reduction_multiple_output(self): + torch._C._jit_set_bailout_depth(2) + + def t(x: torch.Tensor, y: torch.Tensor, scale: float, z: torch.Tensor): + o = torch.mul(x, y) + o = torch.mul(o, scale) + out1 = torch.mul(o, z) + out2 = torch.sum(out1, dim=[2]) + return out1, out2 + + t_jit = torch.jit.script(t) + x = torch.randn(8, 4, 10, 16, dtype=torch.float, device="cuda") + y = torch.randn(8, 4, 10, 16, dtype=torch.float, device="cuda") + z = torch.randn(8, 4, 10, 16, dtype=torch.float, device="cuda") + scale = 0.5 + jit_o = t_jit(x, y, scale, z) + jit_o = t_jit(x, y, scale, z) + o = t(x, y, scale, z) + for oo, jit_oo in zip(o, jit_o): + self.assertEqual(oo.dtype, jit_oo.dtype) + self.assertEqual(oo, jit_oo) + self.assertGraphContains(t_jit.graph_for(x, y, scale, z), FUSION_GROUP) + + x = x.to(memory_format=torch.channels_last) + y = y.to(memory_format=torch.channels_last) + z = z.to(memory_format=torch.channels_last) + jit_o = t_jit(x, y, scale, z) + jit_o = t_jit(x, y, scale, z) + o = t(x, y, scale, z) + for oo, jit_oo in zip(o, jit_o): + self.assertEqual(oo.dtype, jit_oo.dtype) + self.assertEqual(oo, jit_oo) + self.assertGraphContains(t_jit.graph_for(x, y, scale, z), FUSION_GROUP) + + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING and GRAPH_EXECUTOR != + ProfilingMode.LEGACY, "Requires fusion optimization pass to be effective") + @skipIfRocm + def test_reduction_dtype(self): + def t(x: torch.Tensor): + o = torch.mul(x, 1.0) + o = torch.sum(o, dim=[2], dtype=torch.float32) + return o + t_jit = 
torch.jit.script(t) + + x = torch.randn(8, 4, 16, dtype=torch.float, device="cuda") + jit_o = t_jit(x) + jit_o = t_jit(x) + o = t(x) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4)) + self.assertGraphContains(t_jit.graph_for(x), FUSION_GROUP) + + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING and GRAPH_EXECUTOR != + ProfilingMode.LEGACY, "Requires fusion optimization pass to be effective") + @skipIfRocm + def test_reduction_half(self): + def t(x: torch.Tensor): + o = torch.mul(x, 1.0) + o = torch.sum(o, dim=[2]) + return o + + t_jit = torch.jit.script(t) + x = torch.randn(8, 4, 16, dtype=torch.float16, device="cuda") + jit_o = t_jit(x) + jit_o = t_jit(x) + o = t(x) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4)) + self.assertGraphContains(t_jit.graph_for(x), FUSION_GROUP) + @unittest.skipIf(not RUN_CUDA, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING and GRAPH_EXECUTOR != ProfilingMode.LEGACY, "Requires fusion optimization pass to be effective") @@ -651,9 +724,8 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): jit_o = t_jit(x, y, z) jit_o = t_jit(x, y, z) o = t(x, y, z) - for oo, jit_oo in zip(o, jit_o): - self.assertEqual(oo.dtype, jit_oo.dtype) - self.assertEqual(oo, jit_oo) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertEqual(o, jit_o) self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GROUP) @unittest.skipIf(not RUN_CUDA, "requires CUDA") @@ -676,9 +748,8 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): jit_o = t_jit(x, y, z) jit_o = t_jit(x, y, z) o = t(x, y, z) - for oo, jit_oo in zip(o, jit_o): - self.assertEqual(oo.dtype, jit_oo.dtype) - self.assertEqual(oo, jit_oo) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertEqual(o, jit_o) self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GROUP) @@ -731,4 +802,5 @@ def test_register_fuser(self): if __name__ == '__main__': - run_tests() + if not TEST_WITH_ROCM and GRAPH_EXECUTOR != ProfilingMode.PROFILING: + run_tests() diff --git a/test/test_jit_cuda_fuser_legacy.py b/test/test_jit_cuda_fuser_legacy.py index 4b9959c1231e..28ab78370637 100644 --- a/test/test_jit_cuda_fuser_legacy.py +++ b/test/test_jit_cuda_fuser_legacy.py @@ -1,5 +1,11 @@ import sys -sys.argv.append("--ge_config=legacy") +sys.argv.append("--jit_executor=legacy") + +import os +os.environ['PYTORCH_CUDA_FUSER_DISABLE_FALLBACK'] = '1' +os.environ['PYTORCH_CUDA_FUSER_DISABLE_FMA'] = '1' +os.environ['PYTORCH_CUDA_FUSER_JIT_OPT_LEVEL'] = '0' + from test_jit_cuda_fuser import * if __name__ == '__main__': diff --git a/test/test_jit_cuda_fuser_profiling.py b/test/test_jit_cuda_fuser_profiling.py index e2869eca7b5f..5114ab190457 100644 --- a/test/test_jit_cuda_fuser_profiling.py +++ b/test/test_jit_cuda_fuser_profiling.py @@ -1,5 +1,11 @@ import sys -sys.argv.append("--ge_config=profiling") +sys.argv.append("--jit_executor=profiling") + +import os +os.environ['PYTORCH_CUDA_FUSER_DISABLE_FALLBACK'] = '1' +os.environ['PYTORCH_CUDA_FUSER_DISABLE_FMA'] = '1' +os.environ['PYTORCH_CUDA_FUSER_JIT_OPT_LEVEL'] = '0' + from test_jit_cuda_fuser import * if __name__ == '__main__': diff --git a/test/test_jit_fuser.py b/test/test_jit_fuser.py index a75da03a6d21..b4efbf12c358 100644 --- a/test/test_jit_fuser.py +++ b/test/test_jit_fuser.py @@ -10,6 +10,7 @@ RUN_CUDA, RUN_CUDA_HALF, RUN_CUDA_MULTI_GPU, warmup_backward from 
textwrap import dedent from itertools import product, permutations +from torch.testing._internal.common_cuda import with_tf32_off from test_jit import backward_graph, all_backward_graphs, get_lstm_inputs, get_milstm_inputs, \ LSTMCellC, LSTMCellF, LSTMCellS, MiLSTMCell @@ -710,6 +711,9 @@ def test_lstm_cuda(self): "aten::_grad_sum_to_size")) @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + # By default, on Ampere or later GPUs, LSTM computes float tensors at TF32 precision. + # We want float tensors to be computed at full precision in order to use the default precision + @with_tf32_off def test_lstm_concat_cuda(self): inputs = get_lstm_inputs('cuda') ge = self.checkTrace(LSTMCellC, inputs) @@ -740,6 +744,9 @@ def cell(x, hx, cx, w_ih, w_hh, b_ih, b_hh): # TODO: Fuser doesn't work at all when inputs require grad. Fix that @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + # By default, on Ampere or later GPUs, LSTM computes float tensors at TF32 precision. + # We want float tensors to be computed at full precision in order to use the default precision + @with_tf32_off def test_lstm_traced_cuda(self): inputs = get_lstm_inputs('cuda') ge = self.checkTrace(LSTMCellF, inputs) diff --git a/test/test_jit_fuser_legacy.py b/test/test_jit_fuser_legacy.py index c33983e45e79..420075f6e611 100644 --- a/test/test_jit_fuser_legacy.py +++ b/test/test_jit_fuser_legacy.py @@ -1,5 +1,5 @@ import sys -sys.argv.append("--ge_config=legacy") +sys.argv.append("--jit_executor=legacy") from test_jit_fuser import * if __name__ == '__main__': diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index 453047eca8be..5cb43cbe8079 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -1,5 +1,6 @@ from collections import defaultdict +import operator import unittest import contextlib import torch @@ -459,6 +460,121 @@ def func(x): graph = backward_graph(s, skip_check=True) self.assertAllFused(graph, except_for={'aten::div', 'prim::Constant'}) + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + def test_add_bool(self): + def f(x, y, z): + return x + y + z + + x = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + y = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + z = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + + ge = self.checkTrace(f, (x, y, z), inputs_require_grads=False) + self.assertAllFused(ge.graph_for(x, y, z)) + + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + def test_mul_bool(self): + def f(x, y, z): + return x * y * z + + x = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + y = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + z = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + + ge = self.checkTrace(f, (x, y, z), inputs_require_grads=False) + self.assertAllFused(ge.graph_for(x, y, z)) + + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + def test_div_bool(self): + def f(x, y, z): + return (x + y) / z + + x = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + y = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + z = torch.ones_like(x, dtype=torch.bool, device='cuda') + + ge = self.checkTrace(f, (x, y, z), inputs_require_grads=False) + self.assertAllFused(ge.graph_for(x, y, z)) + + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + def test_bitwise_ops(self): + def apply(fn): + return lambda x, y, z: fn(fn(x, y), z) + + dtypes = [ + torch.int8, + torch.uint8, + torch.int16, + torch.int32, + torch.int64, + torch.bool, + ] + 
binary_ops = [ + operator.__and__, + operator.__or__, + operator.__xor__ + ] + devices = ["cuda"] + for dtype, op, device in product(dtypes, binary_ops, devices): + try: + x = self.data_for(dtype, device) + y = self.data_for(dtype, device) + z = self.data_for(dtype, device) + fn = apply(op) + ref = fn(x, y, z) + except Exception: + # If eager mode doesn't support a dtype/op/device combo, + # neither does the fuser. Catch everything to avoid needing to + # guess what errors might be thrown by eager. + continue + try: + t = torch.jit.trace(fn, (x, y, z)) + self.assertEqual(ref, t(x, y, z)) + self.assertAllFused(t.graph_for(x, y, z)) + except Exception as e: + raise RuntimeError( + " ".join(["Failed:", str(dtype), op.__name__, device]) + ) + + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + def test_minmax_int_ops(self): + def apply(fn): + return lambda x, y, z: fn(fn(x, y), z) + + dtypes = [ + torch.int8, + torch.uint8, + torch.int16, + torch.int32, + torch.int64, + torch.bool, + ] + binary_ops = [ + torch.min, + torch.max + ] + devices = ["cuda"] + for dtype, op, device in product(dtypes, binary_ops, devices): + try: + x = self.data_for(dtype, device) + y = self.data_for(dtype, device) + z = self.data_for(dtype, device) + fn = apply(op) + ref = fn(x, y, z) + except Exception: + # If eager mode doesn't support a dtype/op/device combo, + # neither does the fuser. Catch everything to avoid needing to + # guess what errors might be thrown by eager. + continue + try: + t = torch.jit.trace(fn, (x, y, z)) + self.assertEqual(ref, t(x, y, z)) + self.assertAllFused(t.graph_for(x, y, z)) + except Exception as e: + raise RuntimeError( + " ".join(["Failed:", str(dtype), op.__name__, device]) + ) + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") def test_comparison_eq_ne(self): def f(x, y): @@ -567,6 +683,26 @@ def foo(hx, cx): # XXX: TE fuser can handle concats in a fusion group. # FileCheck().check("FusedConcat").check_next("return").run(str(graph)) + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + def test_remove_output_used_only_in_size(self): + def test_fuse(a, b): + c = a + b + d = c + b + return d + + scripted_f = torch.jit.script(test_fuse) + x = torch.ones(1, requires_grad=True, device='cuda') + y = torch.ones(1, requires_grad=True, device='cuda') + warmup_forward(scripted_f, x, y) + g = torch.jit.last_executed_optimized_graph() + diff_nodes = [n for n in g.nodes() if n.kind() == 'prim::DifferentiableGraph'] + self.assertEqual(len(diff_nodes), 1) + g = diff_nodes[0].g('Subgraph') + if_nodes = [n for n in g.nodes() if n.kind() == 'prim::If'] + self.assertEqual(len(if_nodes), 1) + # the if node and the fusion group inside it should only have one output + self.assertEqual(len(list(if_nodes[0].outputs())), 1) + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") def test_concat_invariant_cuda(self): # Invariant: the output of prim::FusedConcat may @@ -1152,7 +1288,7 @@ def apply(fn): torch.int16, torch.int32, torch.int64, - # torch.float16, + torch.float16, torch.float32, torch.float64, torch.bool, @@ -1234,6 +1370,36 @@ def fn(x): self.assertEqual(ref, t(x)) self.assertEqual(len(self.findFusionGroups(t.graph_for(x))), 0) + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + def test_superslomo(self): + # Test extracted from Super-SloMo: https://github.com/avinashpaliwal/Super-SloMo + # A few interesting things happen here: strided inputs of mixed size, + # plus outputs of mixed shapes. 
The latter characteristic happened to + # expose a memory corruption bug due to not properly guarding the + # outputs. + def eager(t0, t1, t2, t3, t4): + t5 = torch.mul(t0, t4) + t6 = torch.mul(t2, t3) + t7 = torch.mul(t6, t1) + t9 = torch.add(t5, t7) + t11 = torch.add(t0, t6) + ft_p = torch.div(t9, t11) + return (ft_p, t11, t9, t6) + + t0 = torch.rand(1, 6, 352, 352, device="cuda").transpose(0, 1) + t1 = torch.rand(6, 3, 352, 352, device="cuda") + t2 = torch.rand(6, device="cuda")[None, None, None, :].permute(3, 0, 1, 2) + t3 = torch.rand(6, 1, 352, 352, device="cuda") + t4 = torch.rand(6, 3, 352, 352, device="cuda") + inputs = [t0, t1, t2, t3, t4] + + script = torch.jit.script(eager) + for _ in range(4): + for pair in zip(script(*inputs), eager(*inputs)): + test, ref = pair + torch.testing.assert_allclose(test, ref) + self.assertAllFused(script.graph_for(*inputs)) + if __name__ == '__main__': run_tests() diff --git a/test/test_jit_legacy.py b/test/test_jit_legacy.py index 2422e518a7f9..b17908e910bb 100644 --- a/test/test_jit_legacy.py +++ b/test/test_jit_legacy.py @@ -1,5 +1,5 @@ import sys -sys.argv.append("--ge_config=legacy") +sys.argv.append("--jit_executor=legacy") from test_jit import * if __name__ == '__main__': diff --git a/test/test_jit_profiling.py b/test/test_jit_profiling.py index be02985e69a8..dc6bb2fbf878 100644 --- a/test/test_jit_profiling.py +++ b/test/test_jit_profiling.py @@ -1,5 +1,5 @@ import sys -sys.argv.append("--ge_config=profiling") +sys.argv.append("--jit_executor=profiling") from test_jit import * if __name__ == '__main__': diff --git a/test/test_jit_py3.py b/test/test_jit_py3.py index 4de5db884035..212b03d9658b 100644 --- a/test/test_jit_py3.py +++ b/test/test_jit_py3.py @@ -621,7 +621,7 @@ def if_function(inp: torch.Tensor) -> Any: def test_module_properties(self): class ModuleWithProperties(torch.nn.Module): - __ignored_properties__ = ["ignored_attr"] + __jit_unused_properties__ = ["ignored_attr"] def __init__(self, a: int): super().__init__() @@ -639,6 +639,15 @@ def attr(self): def ignored_attr(self): return sum([self.a]) + @torch.jit.unused + @property + def ignored_attr_2(self): + return sum([self.a]) + + @ignored_attr_2.setter + def ignored_attr_2(self, value): + self.a = sum([self.a]) + @attr.setter def attr(self, a: int): if a > 0: diff --git a/test/test_jit_simple.py b/test/test_jit_simple.py index 910e4a17713d..23da6602c572 100644 --- a/test/test_jit_simple.py +++ b/test/test_jit_simple.py @@ -1,5 +1,5 @@ import sys -sys.argv.append("--ge_config=simple") +sys.argv.append("--jit_executor=simple") from test_jit import * if __name__ == '__main__': diff --git a/test/test_linalg.py b/test/test_linalg.py index c81b4dc37582..97c7b926faf4 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -1,13 +1,12 @@ import torch import unittest import itertools -import warnings from math import inf, nan, isnan from torch.testing._internal.common_utils import \ - (TestCase, run_tests, TEST_NUMPY) + (TestCase, run_tests, TEST_NUMPY, IS_MACOS, IS_WINDOWS, TEST_WITH_ASAN) from torch.testing._internal.common_device_type import \ - (instantiate_device_type_tests, dtypes, skipCUDAIfNoMagma, skipCPUIfNoLapack) + (instantiate_device_type_tests, dtypes, skipCUDAIfNoMagma, skipCPUIfNoLapack, precisionOverride) from torch.testing._internal.jit_metaprogramming_utils import gen_script_fn_and_args from torch.autograd import gradcheck @@ -17,21 +16,143 @@ class TestLinalg(TestCase): exact_dtype = True - # TODO: test out variant - # Tests torch.ger, and its alias, 
torch.outer, vs. NumPy + # Tests torch.outer, and its alias, torch.ger, vs. NumPy @unittest.skipIf(not TEST_NUMPY, "NumPy not found") - @dtypes(torch.float) + @precisionOverride({torch.bfloat16: 1e-1}) + @dtypes(*(torch.testing.get_all_dtypes())) def test_outer(self, device, dtype): - a = torch.randn(50, device=device, dtype=dtype) - b = torch.randn(50, device=device, dtype=dtype) + def run_test_case(a, b): + if dtype == torch.bfloat16: + a_np = a.to(torch.double).cpu().numpy() + b_np = b.to(torch.double).cpu().numpy() + else: + a_np = a.cpu().numpy() + b_np = b.cpu().numpy() + expected = np.outer(a_np, b_np) + + self.assertEqual(torch.outer(a, b), expected) + self.assertEqual(torch.Tensor.outer(a, b), expected) + + self.assertEqual(torch.ger(a, b), expected) + self.assertEqual(torch.Tensor.ger(a, b), expected) + + # test out variant + out = torch.empty(a.size(0), b.size(0), device=device, dtype=dtype) + torch.outer(a, b, out=out) + self.assertEqual(out, expected) - ops = (torch.ger, torch.Tensor.ger, - torch.outer, torch.Tensor.outer) + out = torch.empty(a.size(0), b.size(0), device=device, dtype=dtype) + torch.ger(a, b, out=out) + self.assertEqual(out, expected) - expected = np.outer(a.cpu().numpy(), b.cpu().numpy()) - for op in ops: - actual = op(a, b) - self.assertEqual(actual, expected) + a = torch.randn(50).to(device=device, dtype=dtype) + b = torch.randn(50).to(device=device, dtype=dtype) + run_test_case(a, b) + + # test 0 strided tensor + zero_strided = torch.randn(1).to(device=device, dtype=dtype).expand(50) + run_test_case(zero_strided, b) + run_test_case(a, zero_strided) + + @unittest.skipIf(not TEST_NUMPY, "NumPy not found") + @precisionOverride({torch.bfloat16: 1e-1}) + @dtypes(*(torch.testing.get_all_dtypes())) + def test_addr(self, device, dtype): + def run_test_case(m, a, b, beta=1, alpha=1): + if dtype == torch.bfloat16: + a_np = a.to(torch.double).cpu().numpy() + b_np = b.to(torch.double).cpu().numpy() + m_np = m.to(torch.double).cpu().numpy() + else: + a_np = a.cpu().numpy() + b_np = b.cpu().numpy() + m_np = m.cpu().numpy() + + if beta == 0: + expected = alpha * np.outer(a_np, b_np) + else: + expected = beta * m_np + alpha * np.outer(a_np, b_np) + + self.assertEqual(torch.addr(m, a, b, beta=beta, alpha=alpha), expected) + self.assertEqual(torch.Tensor.addr(m, a, b, beta=beta, alpha=alpha), expected) + + result_dtype = torch.addr(m, a, b, beta=beta, alpha=alpha).dtype + out = torch.empty_like(m, dtype=result_dtype) + torch.addr(m, a, b, beta=beta, alpha=alpha, out=out) + self.assertEqual(out, expected) + + a = torch.randn(50).to(device=device, dtype=dtype) + b = torch.randn(50).to(device=device, dtype=dtype) + m = torch.randn(50, 50).to(device=device, dtype=dtype) + + # when beta is zero + run_test_case(m, a, b, beta=0., alpha=2) + + # when beta is not zero + run_test_case(m, a, b, beta=0.5, alpha=2) + + # test transpose + m_transpose = torch.transpose(m, 0, 1) + run_test_case(m_transpose, a, b, beta=0.5, alpha=2) + + # test 0 strided tensor + zero_strided = torch.randn(1).to(device=device, dtype=dtype).expand(50) + run_test_case(m, zero_strided, b, beta=0.5, alpha=2) + + # test scalar + m_scalar = torch.tensor(1, device=device, dtype=dtype) + run_test_case(m_scalar, a, b) + + @dtypes(*itertools.product(torch.testing.get_all_dtypes(), + torch.testing.get_all_dtypes())) + def test_outer_type_promotion(self, device, dtypes): + a = torch.randn(5).to(device=device, dtype=dtypes[0]) + b = torch.randn(5).to(device=device, dtype=dtypes[1]) + for op in (torch.outer, 
torch.Tensor.outer, torch.ger, torch.Tensor.ger): + result = op(a, b) + self.assertEqual(result.dtype, torch.result_type(a, b)) + + @dtypes(*itertools.product(torch.testing.get_all_dtypes(), + torch.testing.get_all_dtypes())) + def test_addr_type_promotion(self, device, dtypes): + a = torch.randn(5).to(device=device, dtype=dtypes[0]) + b = torch.randn(5).to(device=device, dtype=dtypes[1]) + m = torch.randn(5, 5).to(device=device, + dtype=torch.result_type(a, b)) + for op in (torch.addr, torch.Tensor.addr): + # pass the integer 1 to the torch.result_type as both + # the default values of alpha and beta are integers (alpha=1, beta=1) + desired_dtype = torch.result_type(m, 1) + result = op(m, a, b) + self.assertEqual(result.dtype, desired_dtype) + + desired_dtype = torch.result_type(m, 2.) + result = op(m, a, b, beta=0, alpha=2.) + self.assertEqual(result.dtype, desired_dtype) + + # Tests migrated from test_torch.py + # 1) test the shape of the result tensor when there is empty input tensor + # 2) test the Runtime Exception when there is scalar input tensor + def test_outer_ger_addr_legacy_tests(self, device): + for size in ((0, 0), (0, 5), (5, 0)): + a = torch.rand(size[0], device=device) + b = torch.rand(size[1], device=device) + + self.assertEqual(torch.outer(a, b).shape, size) + self.assertEqual(torch.ger(a, b).shape, size) + + m = torch.empty(size, device=device) + self.assertEqual(torch.addr(m, a, b).shape, size) + + m = torch.randn(5, 6, device=device) + a = torch.randn(5, device=device) + b = torch.tensor(6, device=device) + self.assertRaises(RuntimeError, lambda: torch.outer(a, b)) + self.assertRaises(RuntimeError, lambda: torch.outer(b, a)) + self.assertRaises(RuntimeError, lambda: torch.ger(a, b)) + self.assertRaises(RuntimeError, lambda: torch.ger(b, a)) + self.assertRaises(RuntimeError, lambda: torch.addr(m, a, b)) + self.assertRaises(RuntimeError, lambda: torch.addr(m, b, a)) # Tests torch.det and its alias, torch.linalg.det, vs. 
NumPy @skipCUDAIfNoMagma @@ -56,11 +177,12 @@ def test_det(self, device, dtype): # NOTE: det requires a 2D+ tensor t = torch.randn(1, device=device, dtype=dtype) - with self.assertRaises(IndexError): + with self.assertRaises(RuntimeError): op(t) # This test confirms that torch.linalg.norm's dtype argument works # as expected, according to the function's documentation + @skipCUDAIfNoMagma def test_norm_dtype(self, device): def run_test_case(input_size, ord, keepdim, from_dtype, to_dtype, compare_dtype): msg = ( @@ -154,6 +276,7 @@ def run_test_case(input, p, dim, keepdim): # This test compares torch.linalg.norm and numpy.linalg.norm to ensure that # their matrix norm results match + @skipCUDAIfNoMagma @unittest.skipIf(not TEST_NUMPY, "NumPy not found") @dtypes(torch.float, torch.double) def test_norm_matrix(self, device, dtype): @@ -354,9 +477,6 @@ def gen_error_message(input_size, ord, keepdim, dim=None): unsupported_matrix_ords = [ (None, r'norm with p=2 not supported for complex tensors'), ('fro', r'frobenius norm not supported for complex tensors'), - (2, r'"svd_cuda" not implemented for \'Complex'), - (-2, r'"svd_cuda" not implemented for \'Complex'), - ('nuc', r'"svd_cuda" not implemented for \'Complex'), ] # Test supported ords @@ -400,6 +520,8 @@ def gen_error_message(input_size, ord, keepdim, dim=None): # Test that linal.norm gives the same result as numpy when inputs # contain extreme values (inf, -inf, nan) + @unittest.skipIf(IS_WINDOWS, "Skipped on Windows!") + @unittest.skipIf(IS_MACOS, "Skipped on MacOS!") @skipCUDAIfNoMagma @skipCPUIfNoLapack @unittest.skipIf(not TEST_NUMPY, "Numpy not found") @@ -440,14 +562,14 @@ def is_broken_matrix_norm_case(ord, x): result_n = np.linalg.norm(x_n, ord=ord) if is_broken_matrix_norm_case(ord, x): - self.assertNotEqual(result, result_n, msg=msg) + continue else: self.assertEqual(result, result_n, msg=msg) # Test degenerate shape results match numpy for linalg.norm vector norms @skipCUDAIfNoMagma @skipCPUIfNoLapack - @unittest.skipIf(not TEST_NUMPY, "Numpy not found") + @unittest.skipIf(TEST_WITH_ASAN, "Skipped on ASAN since it checks for undefined behavior.") @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble) def test_norm_vector_degenerate_shapes(self, device, dtype): def run_test_case(input, ord, dim, keepdim, should_error): @@ -528,18 +650,6 @@ def run_test_case(input, ord, dim, keepdim, should_error): for ord in ord_matrix: run_test_case(input, ord, dim, keepdim, ord in error_ords) - def test_norm_deprecated(self, device): - expected_message = ( - r'torch.norm is deprecated and may be removed in a future PyTorch release. 
' - r'Use torch.linalg.norm instead.') - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - for func in [torch.norm, torch.functional.norm]: - func(torch.rand(10, device=device)) - self.assertEqual(len(w), 2) - for wi in w: - self.assertEqual(str(wi.message), expected_message) - def test_norm_fastpaths(self, device): x = torch.randn(3, 5, device=device) diff --git a/test/test_mobile_optimizer.py b/test/test_mobile_optimizer.py index eae6175fb024..11235edac7c0 100644 --- a/test/test_mobile_optimizer.py +++ b/test/test_mobile_optimizer.py @@ -100,8 +100,8 @@ def forward(self, x): torch.testing.assert_allclose(initial_result, optimized_result, rtol=1e-2, atol=1e-3) - optimization_blacklist_no_prepack = {MobileOptimizerType.INSERT_FOLD_PREPACK_OPS} - optimized_scripted_model_no_prepack = optimize_for_mobile(scripted_model, optimization_blacklist_no_prepack) + optimization_blocklist_no_prepack = {MobileOptimizerType.INSERT_FOLD_PREPACK_OPS} + optimized_scripted_model_no_prepack = optimize_for_mobile(scripted_model, optimization_blocklist_no_prepack) optimized_result_no_prepack = optimized_scripted_model_no_prepack(input_data) FileCheck().check_count("Tensor = aten::conv2d", 1, exactly=True) \ @@ -118,19 +118,36 @@ def forward(self, x): FileCheck().check_count("prim::CallMethod[name=\"forward\"]", 2, exactly=True) \ .run(str(get_forward(bn_scripted_module._c).graph)) - optimization_blacklist_no_prepack = {MobileOptimizerType.INSERT_FOLD_PREPACK_OPS} - bn_fold_scripted_module = optimize_for_mobile(bn_scripted_module, optimization_blacklist_no_prepack) + optimization_blocklist_no_prepack = {MobileOptimizerType.INSERT_FOLD_PREPACK_OPS} + bn_fold_scripted_module = optimize_for_mobile(bn_scripted_module, optimization_blocklist_no_prepack) self.assertEqual(len(torch.jit.export_opnames(bn_fold_scripted_module)), 1) bn_input = torch.rand(1, 1, 6, 6) torch.testing.assert_allclose(bn_scripted_module(bn_input), bn_fold_scripted_module(bn_input), rtol=1e-2, atol=1e-3) - optimization_blacklist_no_fold_bn = {MobileOptimizerType.CONV_BN_FUSION} - no_bn_fold_scripted_module = optimize_for_mobile(bn_scripted_module, optimization_blacklist_no_fold_bn) + optimization_blocklist_no_fold_bn = {MobileOptimizerType.CONV_BN_FUSION} + no_bn_fold_scripted_module = optimize_for_mobile(bn_scripted_module, optimization_blocklist_no_fold_bn) FileCheck().check_count("aten::batch_norm", 1, exactly=True) \ .run(str(get_forward_graph(no_bn_fold_scripted_module._c))) bn_input = torch.rand(1, 1, 6, 6) torch.testing.assert_allclose(bn_scripted_module(bn_input), no_bn_fold_scripted_module(bn_input), rtol=1e-2, atol=1e-3) + class MyMobileOptimizedTagTest(torch.nn.Module): + def __init__(self): + super(MyMobileOptimizedTagTest, self).__init__() + self.linear_weight = torch.nn.Parameter(torch.Tensor(torch.rand(linear_weight_shape))) + self.linear_bias = torch.nn.Parameter(torch.Tensor(torch.rand((weight_output_dim)))) + + def forward(self, x): + o = F.linear(x, self.linear_weight, self.linear_bias) + return F.relu(o) + + mobile_optimized_tag_module = MyMobileOptimizedTagTest() + m = torch.jit.script(mobile_optimized_tag_module) + m.eval() + opt_m = optimize_for_mobile(m) + tag = getattr(opt_m, "mobile_optimized", None) + self.assertTrue(tag) + class MyPreserveMethodsTest(torch.nn.Module): def __init__(self): super(MyPreserveMethodsTest, self).__init__() diff --git a/test/test_nn.py b/test/test_nn.py index 07070d0e550b..7e74d0719eb4 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -33,6 +33,7 @@ from 
torch.autograd.gradcheck import gradgradcheck from torch.nn import Parameter from torch.nn.parallel._functions import Broadcast +from torch.testing import get_all_fp_dtypes from torch.testing._internal.common_utils import freeze_rng_state, run_tests, TestCase, skipIfNoLapack, skipIfRocm, \ TEST_NUMPY, TEST_SCIPY, TEST_WITH_ROCM, download_file, \ get_function_arglist, load_tests, repeat_test_for_types, ALL_TENSORTYPES, \ @@ -52,6 +53,10 @@ from torch.testing._internal.common_utils import _assertGradAndGradgradChecks from torch.testing._internal.common_utils import dtype2prec_DONTUSE from torch.testing._internal.common_cuda import tf32_on_and_off, tf32_is_not_fp32, tf32_off, tf32_on +from torch.types import _TensorOrTensors + + +AMPERE_OR_ROCM = TEST_WITH_ROCM or tf32_is_not_fp32() # load_tests from common_utils is used to automatically filter tests for # sharding on sandcastle. This line silences flake warnings @@ -312,15 +317,19 @@ class TestNN(NNTestCase): _do_cuda_memory_leak_check = True _do_cuda_non_default_stream = True - def _forward(self, module, input): + def _forward(self, module, input: _TensorOrTensors): with freeze_rng_state(): - return module(input) + if isinstance(input, tuple): + return module(*input) + else: + return module(input) - def _backward(self, module, input, output, grad_output, create_graph=False): + def _backward(self, module, input: _TensorOrTensors, output, grad_output, create_graph=False): output.backward(grad_output, retain_graph=True, create_graph=create_graph) - if input.grad is None: - return None - return input.grad.data + if isinstance(input, tuple): + return tuple(map(lambda i: i.grad.data if i.grad is not None else None, input)) + else: + return input.grad.data if input.grad is not None else None def _forward_criterion(self, criterion, input, target, extra_args=None): if extra_args is None: @@ -2646,6 +2655,19 @@ def test_weight_norm(self): m = torch.nn.utils.weight_norm(m) m = torch.nn.utils.weight_norm(m) + def test_parameterlistdict_setting_attributes(self): + mod = nn.ParameterList(map(nn.Parameter, [torch.rand(2), torch.rand(2)])) + + with self.assertWarnsRegex(UserWarning, + r"Setting attributes on ParameterList is not supported"): + torch.nn.utils.weight_norm(mod, "0") + + mod = nn.ParameterDict({"a": nn.Parameter(torch.rand(2)), "b": nn.Parameter(torch.rand(2))}) + + with self.assertWarnsRegex(UserWarning, + r"Setting attributes on ParameterDict is not supported"): + torch.nn.utils.weight_norm(mod, "b") + def test_weight_norm_pickle(self): m = torch.nn.utils.weight_norm(nn.Linear(5, 7)) m = pickle.loads(pickle.dumps(m)) @@ -3910,6 +3932,15 @@ def test_Conv2d_inconsistent_types_on_GPU_without_cudnn(self): # but it should work with the same type nn.functional.conv2d(inputs.float(), weights.float(), bias.float()) + def test_Conv2d_1x1(self): + in_channels = 2 + out_channels = 2 + mod = torch.nn.Conv2d(2, 2, 1, bias=False).to(dtype=torch.double) + input = torch.randn(1, in_channels, 5, 5, requires_grad=True, dtype=torch.double) + for enabled in (False, True): + with torch.backends.mkldnn.flags(enabled=enabled): + gradcheck(F.conv2d, (input, mod.weight)) + @unittest.skipIf(not TEST_CUDA, 'CUDA not available') @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') def test_cudnn_non_contiguous(self): @@ -3939,7 +3970,7 @@ def test_Conv2d_inconsistent_types_on_GPU_with_cudnn(self): @unittest.skipIf(not TEST_CUDA, 'CUDA not available') @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') - @repeat_test_for_types(ALL_TENSORTYPES2) + 
@repeat_test_for_types(get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)) def test_Conv2d_deterministic_cudnn(self, dtype=torch.float): inputs = torch.randn(2, 3, 5, 5, device="cuda", dtype=dtype, requires_grad=True) with cudnn.flags(enabled=True, benchmark=True, deterministic=True): @@ -3969,7 +4000,7 @@ def test_Conv2d_backward_twice(self): lambda: o1.sum().backward()) @unittest.skipIf(not TEST_CUDA, 'CUDA not available') - @repeat_test_for_types(ALL_TENSORTYPES2) + @repeat_test_for_types(get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)) def test_Conv2d_large_workspace(self, dtype=torch.float): # These sizes require huge cuDNN workspaces. Make sure we choose a # reasonable algorithm that does not run out of memory @@ -4096,7 +4127,7 @@ def test_Conv2d_groups_nobias(self): dev_dtypes = [("cpu", torch.float)] if TEST_CUDA: dev_dtypes += [("cuda", torch.float), ("cuda", torch.half)] - if TEST_WITH_ROCM: + if AMPERE_OR_ROCM: dev_dtypes += [("cuda", torch.bfloat16)] for device, dtype in dev_dtypes: m = nn.Conv2d(4, 4, kernel_size=3, groups=2, bias=False).to(device, dtype) @@ -4134,7 +4165,7 @@ def test_Conv2d_groups_nobias_v2(self): dev_dtypes = [("cpu", torch.float)] if TEST_CUDA: dev_dtypes += [("cuda", torch.float), ("cuda", torch.half)] - if TEST_WITH_ROCM: + if AMPERE_OR_ROCM: dev_dtypes += [("cuda", torch.bfloat16)] for device, dtype in dev_dtypes: m = nn.Conv2d(4, 16, kernel_size=3, groups=2, bias=False).to(device, dtype) @@ -5147,6 +5178,493 @@ def test_transformerdecoderlayer_gelu(self): self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) torch.testing.assert_allclose(result, ref_output) + def test_transformerencoder(self): + def get_a_test_layer(use_cuda, activation): + d_model = 4 + nhead = 2 + dim_feedforward = 16 + dropout = 0.0 + device = torch.device("cuda" if use_cuda else "cpu") + + layer = nn.TransformerEncoderLayer( + d_model, + nhead, + dim_feedforward=dim_feedforward, + dropout=dropout, + activation=activation).to(device) + + with torch.no_grad(): + # set constant weights of the model + for idx, p in enumerate(layer.parameters()): + x = p.data + sz = x.view(-1).size(0) + shape = x.shape + x = torch.cos(torch.arange(0, sz).float().view(shape)) + p.data.copy_(x) + + return layer + + # this is a deterministic test for TransformerEncoder + activation = "relu" + use_cuda = torch.cuda.is_available() + device = torch.device("cuda" if use_cuda else "cpu") + + encoder_layer = get_a_test_layer(use_cuda=use_cuda, activation=activation) + + model = nn.TransformerEncoder(encoder_layer, 1).to(device) + + # deterministic input + encoder_input = torch.Tensor([[[0.7462, 0.6653, 0.5679, 0.4891], + [0.5387, 0.1655, 0.3565, 0.0471]], + [[0.8335, 0.2799, 0.5031, 0.2947], + [0.1402, 0.0318, 0.7636, 0.1346]], + [[0.6333, 0.9344, 0.1376, 0.9938], + [0.8924, 0.2872, 0.6692, 0.2944]], + [[0.9897, 0.6915, 0.3154, 0.1733], + [0.8645, 0.3513, 0.3064, 0.0767]], + [[0.8117, 0.2366, 0.4838, 0.7881], + [0.3718, 0.4945, 0.9511, 0.0864]]] + ).to(device) + result = model(encoder_input) + ref_output = torch.Tensor([[[2.428589, 0.020835, -0.602055, -0.085249], + [2.427987, 0.021213, -0.602496, -0.084103]], + [[2.424689, 0.019155, -0.604793, -0.085672], + [2.413863, 0.022211, -0.612486, -0.072490]], + [[2.433774, 0.021598, -0.598343, -0.087548], + [2.425104, 0.019748, -0.604515, -0.084839]], + [[2.436185, 0.022682, -0.596625, -0.087261], + [2.433556, 0.021891, -0.598509, -0.086832]], + [[2.416246, 0.017512, -0.610712, -0.082961], + [2.422901, 0.024187, -0.606178, -0.074929]]] + ).to(device) + 
self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # all 0 + mask = torch.zeros([2, 5]).to(device) == 1 + result = model(encoder_input, src_key_padding_mask=mask) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + mask[0, 1] = 1 + mask[1, 3] = 1 + mask[1, 4] = 1 + result = model(encoder_input, src_key_padding_mask=mask) + ref_output = torch.Tensor([[[2.429026, 0.020793, -0.601741, -0.085642], + [2.428811, 0.021445, -0.601912, -0.084252]], + [[2.425009, 0.019155, -0.604566, -0.085899], + [2.415408, 0.02249, -0.611415, -0.073]], + [[2.434199, 0.021682, -0.598039, -0.087699], + [2.42598, 0.019941, -0.603896, -0.085091]], + [[2.436457, 0.022736, -0.59643, -0.08736], + [2.434021, 0.022093, -0.598179, -0.08679]], + [[2.416531, 0.017498, -0.610513, -0.083181], + [2.4242, 0.024653, -0.605266, -0.074959]]] + ).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # test case 2, multiple layers no norm + model = nn.TransformerEncoder(encoder_layer, 2).to(device) + result = model(encoder_input, src_key_padding_mask=mask) + ref_output = torch.Tensor( + [[[2.419051, 0.017446, -0.608738, -0.085003], + [2.419102, 0.017452, -0.608703, -0.085026]], + [[2.419043, 0.017445, -0.608744, -0.084999], + [2.419052, 0.017446, -0.608738, -0.085004]], + [[2.419067, 0.017448, -0.608727, -0.085010], + [2.419098, 0.017452, -0.608706, -0.085024]], + [[2.419072, 0.017449, -0.608724, -0.085012], + [2.419119, 0.017455, -0.608691, -0.085034]], + [[2.419019, 0.017442, -0.608761, -0.084989], + [2.419075, 0.017449, -0.608722, -0.085014]]] + ).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + model = nn.TransformerEncoder(encoder_layer, 6).to(device) + result = model(encoder_input, src_key_padding_mask=mask) + ref_output = torch.Tensor( + [[[2.419101, 0.017453, -0.608703, -0.085025], + [2.419101, 0.017453, -0.608704, -0.085025]], + [[2.419101, 0.017453, -0.608703, -0.085025], + [2.419101, 0.017453, -0.608704, -0.085025]], + [[2.419101, 0.017453, -0.608703, -0.085025], + [2.419101, 0.017453, -0.608704, -0.085025]], + [[2.419101, 0.017453, -0.608703, -0.085025], + [2.419101, 0.017453, -0.608704, -0.085025]], + [[2.419101, 0.017453, -0.608703, -0.085025], + [2.419101, 0.017453, -0.608704, -0.085025]]] + ).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # test case 3, multiple layers with norm + # d_model = 4 + norm = nn.LayerNorm(4) + model = nn.TransformerEncoder(encoder_layer, 2, norm=norm).to(device) + result = model(encoder_input, src_key_padding_mask=mask) + ref_output = torch.Tensor( + [[[1.695949, -0.357635, -0.893077, -0.445238], + [1.695955, -0.357639, -0.893050, -0.445266]], + [[1.695948, -0.357634, -0.893082, -0.445233], + [1.695950, -0.357635, -0.893077, -0.445238]], + [[1.695951, -0.357636, -0.893069, -0.445246], + [1.695955, -0.357639, -0.893052, -0.445264]], + [[1.695952, -0.357636, -0.893066, -0.445249], + [1.695957, -0.357641, -0.893041, -0.445276]], + [[1.695946, -0.357632, -0.893095, -0.445220], + [1.695952, -0.357637, -0.893065, -0.445251]]] + ).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + 
torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + model = nn.TransformerEncoder(encoder_layer, 6, norm=norm).to(device) + result = model(encoder_input, src_key_padding_mask=mask) + ref_output = torch.Tensor( + [[[1.695955, -0.357639, -0.893051, -0.445265], + [1.695955, -0.357639, -0.893051, -0.445265]], + [[1.695955, -0.357639, -0.893051, -0.445265], + [1.695955, -0.357639, -0.893051, -0.445265]], + [[1.695955, -0.357639, -0.893051, -0.445265], + [1.695955, -0.357639, -0.893051, -0.445265]], + [[1.695955, -0.357639, -0.893051, -0.445265], + [1.695955, -0.357639, -0.893051, -0.445265]], + [[1.695955, -0.357639, -0.893051, -0.445265], + [1.695955, -0.357639, -0.893051, -0.445265]]] + ).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + + def test_transformerdecoder(self): + def get_a_test_layer(use_cuda, activation): + d_model = 4 + nhead = 2 + dim_feedforward = 16 + dropout = 0.0 + device = torch.device("cuda" if use_cuda else "cpu") + + layer = nn.TransformerDecoderLayer( + d_model, + nhead, + dim_feedforward=dim_feedforward, + dropout=dropout, + activation=activation).to(device) + + with torch.no_grad(): + # set constant weights of the model + for idx, p in enumerate(layer.parameters()): + x = p.data + sz = x.view(-1).size(0) + shape = x.shape + x = torch.cos(torch.arange(0, sz).float().view(shape)) + p.data.copy_(x) + + return layer + + # this is a deterministic test for TransformerDecoder + activation = "relu" + use_cuda = torch.cuda.is_available() + device = torch.device("cuda" if use_cuda else "cpu") + + decoder_layer = get_a_test_layer(use_cuda=use_cuda, activation=activation) + + model = nn.TransformerDecoder(decoder_layer, 1).to(device) + + # deterministic input + decoder_input = torch.Tensor([[[20, 30, 40, 50]]]).to(device) + memory_input = torch.Tensor([[[60, 70, 80, 90]]]).to(device) + result = model(decoder_input, memory_input) + ref_output = torch.Tensor( + [[[2.314351, 0.094805, -0.671322, 0.101977]]]).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # deterministic input + decoder_input = torch.Tensor([[[9, 10, 11, 12]], + [[11, 12, 13, 14]]]).to(device) + memory_input = torch.Tensor([[[1, 2, 3, 4]]]).to(device) + result = model(decoder_input, memory_input) + ref_output = torch.Tensor( + [[[2.422245, 0.051716, -0.606338, -0.024756]], + [[2.422245, 0.051716, -0.606338, -0.024756]]] + ).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # deterministic input + decoder_input = torch.Tensor([[[1, 2, 3, 4]], + [[5, 6, 7, 8]]]).to(device) + memory_input = torch.Tensor([[[9, 10, 11, 12]], + [[11, 12, 13, 14]]]).to(device) + result = model(decoder_input, memory_input) + ref_output = torch.Tensor( + [[[2.343536, 0.085561, -0.654954, 0.074991]], + [[2.343536, 0.085561, -0.654954, 0.074991]]] + ).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # deterministic input + decoder_input = torch.Tensor([[[0.4517, 0.6793, 0.5313, 0.0034], + [0.2678, 0.3677, 0.4459, 0.7166]], + [[0.8100, 0.3716, 0.4096, 0.1976], + [0.6958, 0.8844, 0.6081, 0.8315]], + [[0.0494, 0.9343, 0.5955, 0.3830], + [0.5404, 0.3464, 0.9378, 0.6200]]] + ).to(device) + memory_input = 
torch.Tensor([[[0.7462, 0.6653, 0.5679, 0.4891], + [0.5387, 0.1655, 0.3565, 0.0471]], + [[0.8335, 0.2799, 0.5031, 0.2947], + [0.1402, 0.0318, 0.7636, 0.1346]], + [[0.6333, 0.9344, 0.1376, 0.9938], + [0.8924, 0.2872, 0.6692, 0.2944]], + [[0.9897, 0.6915, 0.3154, 0.1733], + [0.8645, 0.3513, 0.3064, 0.0767]], + [[0.8117, 0.2366, 0.4838, 0.7881], + [0.3718, 0.4945, 0.9511, 0.0864]]] + ).to(device) + result = model(decoder_input, memory_input) + ref_output = torch.Tensor([[[2.430065, 0.027862, -0.601136, -0.073096], + [2.431935, 0.028907, -0.599809, -0.072488]], + [[2.428457, 0.027053, -0.602275, -0.073462], + [2.431970, 0.029387, -0.599789, -0.071621]], + [[2.431934, 0.028196, -0.599802, -0.073809], + [2.432306, 0.028858, -0.599542, -0.072846]]] + ).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # key_padding_mask + key_padding_mask = torch.zeros(2, 3).to(device) == 1 + result = model(decoder_input, + memory_input, + tgt_key_padding_mask=key_padding_mask) + ref_output = torch.Tensor([[[2.430065, 0.027862, -0.601136, -0.073096], + [2.431935, 0.028907, -0.599809, -0.072488]], + [[2.428457, 0.027053, -0.602275, -0.073462], + [2.431970, 0.029387, -0.599789, -0.071621]], + [[2.431934, 0.028196, -0.599802, -0.073809], + [2.432306, 0.028858, -0.599542, -0.072846]]] + ).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # key_padding_mask + key_padding_mask[0, 2] = 1 + key_padding_mask[1, 1] = 1 + key_padding_mask[1, 2] = 1 + result = model(decoder_input, + memory_input, + tgt_key_padding_mask=key_padding_mask) + ref_output = torch.Tensor([[[2.430025, 0.027643, -0.601164, -0.073476], + [2.4323, 0.029375, -0.599553, -0.071881]], + [[2.428523, 0.026838, -0.602226, -0.07391], + [2.432634, 0.029842, -0.599318, -0.071253]], + [[2.432278, 0.028152, -0.599555, -0.074139], + [2.432659, 0.029244, -0.599294, -0.072382]]] + ).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # memory_key_padding_mask + key_padding_mask = torch.zeros(2, 5).to(device) == 1 + result = model(decoder_input, + memory_input, + memory_key_padding_mask=key_padding_mask) + ref_output = torch.Tensor([[[2.430065, 0.027862, -0.601136, -0.073096], + [2.431935, 0.028907, -0.599809, -0.072488]], + [[2.428457, 0.027053, -0.602275, -0.073462], + [2.431970, 0.029387, -0.599789, -0.071621]], + [[2.431934, 0.028196, -0.599802, -0.073809], + [2.432306, 0.028858, -0.599542, -0.072846]]] + ).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # memory_key_padding_mask + key_padding_mask[0, 4] = 1 + key_padding_mask[1, 3] = 1 + key_padding_mask[1, 4] = 1 + result = model(decoder_input, + memory_input, + memory_key_padding_mask=key_padding_mask) + ref_output = torch.Tensor([[[2.429757, 0.027358, -0.601351, -0.073816], + [2.432692, 0.028583, -0.599263, -0.073634]], + [[2.428247, 0.02662, -0.602419, -0.074123], + [2.432657, 0.029055, -0.599293, -0.072732]], + [[2.431515, 0.027687, -0.600096, -0.074459], + [2.433075, 0.028543, -0.598987, -0.073985]]] + ).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # multiple layers no norm + model = 
nn.TransformerDecoder(decoder_layer, 2).to(device) + + # deterministic input + decoder_input = torch.Tensor([[[20, 30, 40, 50]]]).to(device) + memory_input = torch.Tensor([[[60, 70, 80, 90]]]).to(device) + result = model(decoder_input, memory_input) + ref_output = torch.Tensor( + [[[2.31316, 0.0950293, -0.671995, 0.102802]]]).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # multiple layers no norm + model = nn.TransformerDecoder(decoder_layer, 6).to(device) + + # deterministic input + decoder_input = torch.Tensor([[[0.4517, 0.6793, 0.5313, 0.0034], + [0.2678, 0.3677, 0.4459, 0.7166]], + [[0.8100, 0.3716, 0.4096, 0.1976], + [0.6958, 0.8844, 0.6081, 0.8315]], + [[0.0494, 0.9343, 0.5955, 0.3830], + [0.5404, 0.3464, 0.9378, 0.6200]]] + ).to(device) + memory_input = torch.Tensor([[[0.7462, 0.6653, 0.5679, 0.4891], + [0.5387, 0.1655, 0.3565, 0.0471]], + [[0.8335, 0.2799, 0.5031, 0.2947], + [0.1402, 0.0318, 0.7636, 0.1346]], + [[0.6333, 0.9344, 0.1376, 0.9938], + [0.8924, 0.2872, 0.6692, 0.2944]], + [[0.9897, 0.6915, 0.3154, 0.1733], + [0.8645, 0.3513, 0.3064, 0.0767]], + [[0.8117, 0.2366, 0.4838, 0.7881], + [0.3718, 0.4945, 0.9511, 0.0864]]] + ).to(device) + result = model(decoder_input, memory_input) + ref_output = torch.Tensor( + [[[2.42794, 0.026164, -0.60263, -0.0747591], + [2.43113, 0.0279516, -0.600376, -0.0736896]], + [[2.42794, 0.026164, -0.60263, -0.0747591], + [2.43113, 0.0279516, -0.600376, -0.0736896]], + [[2.42794, 0.026164, -0.60263, -0.0747591], + [2.43113, 0.0279516, -0.600376, -0.0736896]]] + ).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # multiple layers with norm + # d_model = 4 + norm = nn.LayerNorm(4) + model = nn.TransformerDecoder(decoder_layer, 2, norm=norm).to(device) + + # deterministic input + decoder_input = torch.Tensor([[[20, 30, 40, 50]]]).to(device) + memory_input = torch.Tensor([[[60, 70, 80, 90]]]).to(device) + result = model(decoder_input, memory_input) + ref_output = torch.Tensor( + [[[1.66166, -0.326986, -1.01466, -0.320017]]]).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # multiple layers with norm + model = nn.TransformerDecoder(decoder_layer, 6, norm=norm).to(device) + + # deterministic input + decoder_input = torch.Tensor([[[0.4517, 0.6793, 0.5313, 0.0034], + [0.2678, 0.3677, 0.4459, 0.7166]], + [[0.8100, 0.3716, 0.4096, 0.1976], + [0.6958, 0.8844, 0.6081, 0.8315]], + [[0.0494, 0.9343, 0.5955, 0.3830], + [0.5404, 0.3464, 0.9378, 0.6200]]] + ).to(device) + memory_input = torch.Tensor([[[0.7462, 0.6653, 0.5679, 0.4891], + [0.5387, 0.1655, 0.3565, 0.0471]], + [[0.8335, 0.2799, 0.5031, 0.2947], + [0.1402, 0.0318, 0.7636, 0.1346]], + [[0.6333, 0.9344, 0.1376, 0.9938], + [0.8924, 0.2872, 0.6692, 0.2944]], + [[0.9897, 0.6915, 0.3154, 0.1733], + [0.8645, 0.3513, 0.3064, 0.0767]], + [[0.8117, 0.2366, 0.4838, 0.7881], + [0.3718, 0.4945, 0.9511, 0.0864]]] + ).to(device) + result = model(decoder_input, memory_input) + ref_output = torch.Tensor( + [[[1.69559, -0.357291, -0.894741, -0.443553], + [1.69571, -0.357363, -0.894154, -0.444196]], + [[1.69559, -0.357291, -0.894741, -0.443553], + [1.69571, -0.357363, -0.894154, -0.444196]], + [[1.69559, -0.357291, -0.894741, -0.443553], + [1.69571, -0.357363, -0.894154, -0.444196]]] + ).to(device) + 
self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # gelu activation test cases + activation = "gelu" + use_cuda = torch.cuda.is_available() + device = torch.device("cuda" if use_cuda else "cpu") + + decoder_layer = get_a_test_layer(use_cuda=use_cuda, activation=activation) + + model = nn.TransformerDecoder(decoder_layer, 1).to(device) + + # deterministic input + decoder_input = torch.Tensor([[[20, 30, 40, 50]]]).to(device) + memory_input = torch.Tensor([[[60, 70, 80, 90]]]).to(device) + result = model(decoder_input, memory_input) + ref_output = torch.Tensor([[[2.306435, 0.095946, -0.675796, 0.10687]]] + ).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output) + + # deterministic input + decoder_input = torch.Tensor([[[9, 10, 11, 12]], + [[11, 12, 13, 14]]]).to(device) + memory_input = torch.Tensor([[[1, 2, 3, 4]]]).to(device) + result = model(decoder_input, memory_input) + ref_output = torch.Tensor( + [[[2.415448, 0.054389, -0.610932, -0.0156613]], + [[2.415448, 0.054389, -0.610932, -0.0156613]]]).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output) + + # deterministic input + decoder_input = torch.Tensor([[[1, 2, 3, 4]], + [[5, 6, 7, 8]]]).to(device) + memory_input = torch.Tensor([[[9, 10, 11, 12]], + [[11, 12, 13, 14]]]).to(device) + result = model(decoder_input, memory_input) + ref_output = torch.Tensor( + [[[2.338531, 0.087709, -0.65776, 0.080646]], + [[2.338531, 0.087709, -0.65776, 0.080646]]]).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output) + + # deterministic input + decoder_input = torch.Tensor([[[0.4517, 0.6793, 0.5313, 0.0034], + [0.2678, 0.3677, 0.4459, 0.7166]], + [[0.8100, 0.3716, 0.4096, 0.1976], + [0.6958, 0.8844, 0.6081, 0.8315]], + [[0.0494, 0.9343, 0.5955, 0.3830], + [0.5404, 0.3464, 0.9378, 0.6200]]] + ).to(device) + memory_input = torch.Tensor([[[0.7462, 0.6653, 0.5679, 0.4891], + [0.5387, 0.1655, 0.3565, 0.0471]], + [[0.8335, 0.2799, 0.5031, 0.2947], + [0.1402, 0.0318, 0.7636, 0.1346]], + [[0.6333, 0.9344, 0.1376, 0.9938], + [0.8924, 0.2872, 0.6692, 0.2944]], + [[0.9897, 0.6915, 0.3154, 0.1733], + [0.8645, 0.3513, 0.3064, 0.0767]], + [[0.8117, 0.2366, 0.4838, 0.7881], + [0.3718, 0.4945, 0.9511, 0.0864]]] + ).to(device) + result = model(decoder_input, memory_input) + ref_output = torch.Tensor( + [[[2.42049104, 0.03443088, -0.60793706, -0.05436271], + [2.42210631, 0.03546578, -0.60679895, -0.05357488]], + [[2.41907674, 0.0336104, -0.60892977, -0.05490462], + [2.42216881, 0.03586554, -0.6067524, -0.05289126]], + [[2.42205716, 0.03488046, -0.60683681, -0.05460596], + [2.42240309, 0.0354595, -0.60659063, -0.05378816]]]).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output) + + @unittest.skipIf(not (TEST_CUDNN and TEST_MULTIGPU), 'CUDNN or multi-gpu not available') def test_cudnn_rnn_dropout_states_device(self): rnn = nn.RNN(10, 20, num_layers=2, dropout=.5) @@ -5880,7 +6398,7 @@ def test_inplace_thnn(self): self.assertEqual(grad_output, grad_output_clone) @unittest.skipIf(not TEST_CUDA, 'CUDA not available') - @repeat_test_for_types(ALL_TENSORTYPES2) + @repeat_test_for_types(get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)) def test_noncontig_conv_grad_cuda(self, dtype=torch.float): # 
FIXME: remove after adding non-contiguous grad tests for all modules module = nn.Conv2d(3, 5, kernel_size=3, padding=1).to("cuda", dtype) @@ -6523,6 +7041,10 @@ def test_l1_loss_correct(self): torch.nn.L1Loss()(input, torch.zeros_like(input)), input.abs().mean()) + def test_smoothl1loss_negative_beta_not_supported(self): + with self.assertRaises(RuntimeError): + F.smooth_l1_loss(torch.randn(2, 2), torch.randn(2, 2), beta=-1.0) + def test_cosine_similarity(self): input1 = torch.randn(4, 4, requires_grad=True) input2 = torch.randn(4, 4, requires_grad=True) @@ -8628,6 +9150,18 @@ def test_fuse_module_eval_numerics(self, X, running_mean, running_var): self.assertEqual(Y_ref, Y_hat, msg="Conv+BN fusion results are off") + na_bn_ref = torch.nn.BatchNorm2d(oC, affine=False) + na_bn_ref.running_mean = torch.from_numpy(running_mean[0]).to(torch.double) + na_bn_ref.running_var = torch.from_numpy(running_var[0]).to(torch.double) + na_bn_ref.eval() + + Y_ref = na_bn_ref(conv_ref(inputs)) + conv_na_bn_fused = torch.nn.utils.fusion.fuse_conv_bn_eval(conv_ref, + na_bn_ref) + Y_hat = conv_na_bn_fused(inputs) + + self.assertEqual(Y_ref, Y_hat, msg="Conv+BN(non-affine) fusion results are off") + class TestAddRelu(TestCase): def test_add_relu(self): @@ -9853,6 +10387,7 @@ def v(fn): v(lambda: F.multilabel_margin_loss(input, zeros, reduction=reduction)) v(lambda: F.triplet_margin_loss(input, input, input, reduction=reduction)) + v(lambda: F.triplet_margin_with_distance_loss(input, input, input, reduction=reduction)) v(lambda: F.margin_ranking_loss(input, input, input.sign(), reduction=reduction)) v(lambda: F.cosine_embedding_loss(input, input, input[:, 0].sign(), reduction=reduction)) @@ -10683,6 +11218,63 @@ def test_contig_wrong_stride_cudnn(self, device): F.conv_transpose2d(x, torch.randn(16, 1, 1, 1, device=device)) F.conv2d(x, torch.randn(1, 16, 1, 1, device=device)) + @onlyCUDA + def test_Conv2d_size_1_kernel(self, device): + x_cpu = torch.randn(2, 3, 5, 5) + conv_cpu = torch.nn.Conv2d(3, 3, kernel_size=1) + y_cpu = conv_cpu(x_cpu) + y = torch.rand_like(y_cpu) + y_cpu.backward(y) + + with cudnn.flags(enabled=False): + conv_cuda = torch.nn.Conv2d(3, 3, kernel_size=1).to(device) + conv_cuda.bias.data.copy_(conv_cpu.bias.data) + conv_cuda.weight.data.copy_(conv_cpu.weight.data) + y_cuda = conv_cuda(x_cpu.to(device)) + y_cuda.backward(y.to(device)) + + self.assertEqual(y_cpu, y_cuda, atol=1e-5, rtol=0, exact_device=False) + self.assertEqual(conv_cpu.bias.grad.data, conv_cuda.bias.grad.data, atol=1e-5, rtol=0, exact_device=False) + self.assertEqual(conv_cpu.weight.grad.data, conv_cuda.weight.grad.data, atol=1e-5, rtol=0, exact_device=False) + + @onlyCUDA + def test_ConvTranspose2d_size_1_kernel(self, device): + x_cpu = torch.randn(2, 3, 5, 5) + conv_cpu = torch.nn.ConvTranspose2d(3, 3, kernel_size=1) + y_cpu = conv_cpu(x_cpu) + y = torch.rand_like(y_cpu) + y_cpu.backward(y) + + with cudnn.flags(enabled=False): + conv_cuda = torch.nn.ConvTranspose2d(3, 3, kernel_size=1).to(device) + conv_cuda.bias.data.copy_(conv_cpu.bias.data) + conv_cuda.weight.data.copy_(conv_cpu.weight.data) + y_cuda = conv_cuda(x_cpu.to(device)) + y_cuda.backward(y.to(device)) + + self.assertEqual(y_cpu, y_cuda, atol=1e-5, rtol=0, exact_device=False) + self.assertEqual(conv_cpu.bias.grad.data, conv_cuda.bias.grad.data, atol=1e-5, rtol=0, exact_device=False) + self.assertEqual(conv_cpu.weight.grad.data, conv_cuda.weight.grad.data, atol=1e-5, rtol=0, exact_device=False) + + @onlyCUDA + def test_ConvTranspose3d_size_1_kernel(self, device): 
+ x_cpu = torch.randn(2, 3, 3, 5, 5) + conv_cpu = torch.nn.ConvTranspose3d(3, 3, kernel_size=1) + y_cpu = conv_cpu(x_cpu) + y = torch.rand_like(y_cpu) + y_cpu.backward(y) + + with cudnn.flags(enabled=False): + conv_cuda = torch.nn.ConvTranspose3d(3, 3, kernel_size=1).to(device) + conv_cuda.bias.data.copy_(conv_cpu.bias.data) + conv_cuda.weight.data.copy_(conv_cpu.weight.data) + y_cuda = conv_cuda(x_cpu.to(device)) + y_cuda.backward(y.to(device)) + + self.assertEqual(y_cpu, y_cuda, atol=1e-5, rtol=0, exact_device=False) + self.assertEqual(conv_cpu.bias.grad.data, conv_cuda.bias.grad.data, atol=1e-5, rtol=0, exact_device=False) + self.assertEqual(conv_cpu.weight.grad.data, conv_cuda.weight.grad.data, atol=1e-5, rtol=0, exact_device=False) + def _ordered_sequence(self, device, dtype): """Create ordered list of random sequences""" seqs = [torch.empty(random.randint(1, 6), device=device, dtype=dtype) @@ -11287,7 +11879,7 @@ def test_multihead_attention_dtype(self, device, dtype): self.assertEqual(q.size(), out[0].size()) self.assertEqual(dtype, out[0].dtype) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)) @dtypes(torch.float) def test_Conv2d_naive_groups(self, device, dtype): # Check that grouped convolutions matches two half convolutions @@ -11537,32 +12129,32 @@ def expected_output(dim): self.assertEqual(output[0, 0, 0, 0], float("-inf")) self.assertEqual(indices[0, 0, 0, 0], 0) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_MaxPool1d_indices(self, device, dtype): self._test_maxpool_indices(1, device=device, dtype=dtype) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_MaxPool2d_indices(self, device, dtype): self._test_maxpool_indices(2, device=device, dtype=dtype) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_MaxPool3d_indices(self, device, dtype): self._test_maxpool_indices(3, device=device, dtype=dtype) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_AdaptiveMaxPool1d_indices(self, device, dtype): self._test_maxpool_indices(1, adaptive=True, device=device, dtype=dtype) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_AdaptiveMaxPool2d_indices(self, device, dtype): self._test_maxpool_indices(2, adaptive=True, device=device, dtype=dtype) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_AdaptiveMaxPool3d_indices(self, device, dtype): self._test_maxpool_indices(3, adaptive=True, device=device, dtype=dtype) @@ -11635,7 +12227,7 @@ def test_pooling_zero_stride(self, device): self.assertRaisesRegex(RuntimeError, r"stride should not be zero|stride must be greater than zero", lambda: fn_module(x)) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_pool_large_size(self, device, dtype): for op in ('max', 'avg'): @@ -11649,7 +12241,7 @@ def test_pool_large_size(self, device, dtype): # check if the output shape was still computed correctly self.assertEqual(x.shape[2], res.shape[2]) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_pool_invalid_size(self, device, dtype): for op in ('max', 'avg'): @@ -11945,6 +12537,7 @@ def _test_conv_cudnn_nhwc_nchw(self, layer, n, c, h, w, k, filter_size, device): @onlyCUDA 
@skipCUDAIfRocm @skipCUDAIfCudnnVersionLessThan(7603) + @tf32_on_and_off(0.05) def test_conv_cudnn_mismatch_memory_format(self, device): configs = [ [4, 2, 8, 8, 4, 2], @@ -12171,6 +12764,85 @@ def test_threshold_inplace_overlap(self, device): F.threshold(x, 0.5, 0.5, inplace=True) F.threshold_(x, 0.5, 0.5) + @onlyOnCPUAndCUDA + def test_triplet_margin_with_distance_loss_default_parity(self, device): + # Test for `nn.TripletMarginWithDistanceLoss` and + # `F.triplet_margin_with_distance_loss`. Checks + # for parity against the respective non-distance-agnostic + # implementations of triplet margin loss (``nn.TripletMarginLoss` + # and `F.triplet_margin_loss`) under *default args*. + + for extra_args in \ + itertools.product((0.5, 1, 1.5), (True, False), ('none', 'mean', 'sum')): + kwargs = {'margin': extra_args[0], 'swap': extra_args[1], 'reduction': extra_args[2]} + + anchor = torch.randn(5, 10, device=device, requires_grad=True) + positive = torch.randn(5, 10, device=device, requires_grad=True) + negative = torch.randn(5, 10, device=device, requires_grad=True) + + # Test forward, functional + expected = F.triplet_margin_loss(anchor, positive, negative, **kwargs) + actual = F.triplet_margin_with_distance_loss(anchor, positive, negative, **kwargs) + self.assertEqual(actual, expected, rtol=1e-6, atol=1e-6) + + # Test forward, module + loss_ref = nn.TripletMarginLoss(**kwargs) + loss_op = nn.TripletMarginWithDistanceLoss(**kwargs) + self.assertEqual(loss_op(anchor, positive, negative), + loss_ref(anchor, positive, negative), + rtol=1e-6, atol=1e-6) + + # Test backward + self.assertTrue(gradcheck(lambda a, p, n: F.triplet_margin_with_distance_loss( + a, p, n, **kwargs), (anchor, positive, negative))) + self.assertTrue(gradcheck(lambda a, p, n: loss_op(a, p, n), + (anchor, positive, negative))) + + @onlyOnCPUAndCUDA + def test_triplet_margin_with_distance_loss(self, device): + # Test for parity between `nn.TripletMarginWithDistanceLoss` and + # `F.triplet_margin_with_distance_loss`. 
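        # Illustrative sketch (an assumption, not part of the test): both APIs compute the
        # standard triplet margin loss, loss_i = max(d(a_i, p_i) - d(a_i, n_i) + margin, 0),
        # where swap=True substitutes min(d(a_i, n_i), d(p_i, n_i)) for the negative distance.
        # `reference_triplet_loss` is a hypothetical helper using the default p=2 distance
        # and 'mean' reduction.
        def reference_triplet_loss(anchor, positive, negative, margin=1.0):
            d = nn.PairwiseDistance()
            return torch.clamp(d(anchor, positive) - d(anchor, negative) + margin, min=0).mean()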
+ + pairwise_distance = nn.PairwiseDistance() + + def cosine_distance(x, y): + return 1.0 - F.cosine_similarity(x, y) + + distance_functions = (pairwise_distance, cosine_distance, + lambda x, y: 1.0 - F.cosine_similarity(x, y)) + + reductions = ('mean', 'none', 'sum') + margins = (1.0, 1.5, 0.5) + swaps = (True, False) + + for distance_fn, reduction, margin, swap \ + in itertools.product(distance_functions, reductions, margins, swaps): + anchor = torch.randn(5, 10, device=device, requires_grad=True) + positive = torch.randn(5, 10, device=device, requires_grad=True) + negative = torch.randn(5, 10, device=device, requires_grad=True) + + # Test backward + self.assertTrue(gradcheck(lambda a, p, n: F.triplet_margin_with_distance_loss( + a, p, n, distance_function=distance_fn, reduction=reduction, margin=margin, swap=swap), + (anchor, positive, negative))) + loss_op = nn.TripletMarginWithDistanceLoss(distance_function=distance_fn, + reduction=reduction, margin=margin, swap=swap) + self.assertTrue(gradcheck(lambda a, p, n: loss_op( + a, p, n), (anchor, positive, negative))) + traced_loss_op = torch.jit.trace(loss_op, (anchor, positive, negative)) + self.assertTrue(gradcheck(lambda a, p, n: traced_loss_op( + a, p, n), (anchor, positive, negative))) + + # Test forward parity + functional = F.triplet_margin_with_distance_loss(anchor, positive, negative, + distance_function=distance_fn, + reduction=reduction, margin=margin, swap=swap) + modular = loss_op(anchor, positive, negative) + traced = traced_loss_op(anchor, positive, negative) + self.assertEqual(functional, modular, atol=1e-6, rtol=1e-6) + self.assertEqual(traced, modular, atol=1e-6, rtol=1e-6) + + class TestModuleGlobalHooks(TestCase): def tearDown(self): diff --git a/test/test_op_aliases.py b/test/test_op_aliases.py index 7ad691328c4b..8a106d7860d1 100644 --- a/test/test_op_aliases.py +++ b/test/test_op_aliases.py @@ -45,7 +45,7 @@ def __init__(self, decorators=(skipCPUIfNoLapack, skipCUDAIfNoMagma)), # NOTE: only runs on CPU because it leaks CUDA memory # (see https://github.com/pytorch/pytorch/issues/43119) - AliasInfo('outer', torch.outer, 'ger', torch.ger, + AliasInfo('ger', torch.ger, 'outer', torch.outer, lambda d: torch.randn(20, device=d), get_args=lambda d: (torch.randn(20, device=d),), decorators=(onlyCPU,)), AliasInfo('arccosh', torch.arccosh, 'acosh', torch.acosh, diff --git a/test/test_ops.py b/test/test_ops.py index 28570d9892ab..5be450d4d41f 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -131,8 +131,31 @@ def test_inplace_gradgrad(self, device, dtype, op): self._gradgrad_test_helper(device, dtype, op, self._get_safe_inplace(op.get_inplace())) +class TestOut(TestCase): + exact_dtype = True + + @ops(op_db) + def test_out(self, device, dtype, op): + if not op.supports_tensor_out: + self.skipTest("Skipped! Operator %s does not support out=..." % op.name) + samples = op.sample_inputs(device, dtype) + if len(samples) == 0: + self.skipTest("Skipped! No sample inputs!") + + # NOTE: only tests on first sample + sample = samples[0] + # call it normally to get the expected result + expected = op(sample.input, *sample.args, **sample.kwargs) + # call it with out=... 
and check we get the expected result + out_kwargs = sample.kwargs.copy() + out_kwargs['out'] = out = torch.empty_like(expected) + op(sample.input, *sample.args, **out_kwargs) + self.assertEqual(expected, out) + + instantiate_device_type_tests(TestOpInfo, globals()) instantiate_device_type_tests(TestGradients, globals()) +instantiate_device_type_tests(TestOut, globals()) if __name__ == '__main__': run_tests() diff --git a/test/test_optim.py b/test/test_optim.py index b00184cc9343..3e3e6610fa01 100644 --- a/test/test_optim.py +++ b/test/test_optim.py @@ -6,6 +6,7 @@ import torch from torch._six import inf import torch.optim as optim +import torch.optim._multi_tensor as optim_mt import torch.nn.functional as F from torch.optim import SGD from torch.autograd import Variable @@ -249,105 +250,199 @@ def _build_params_dict_single(self, weight, bias, **kwargs): return [dict(params=bias, **kwargs)] def test_sgd(self): - self._test_basic_cases( - lambda weight, bias: optim.SGD([weight, bias], lr=1e-3) - ) - self._test_basic_cases( - lambda weight, bias: optim.SGD( - self._build_params_dict(weight, bias, lr=1e-2), - lr=1e-3) - ) - self._test_basic_cases( - lambda weight, bias: optim.SGD( - self._build_params_dict_single(weight, bias, lr=1e-2), - lr=1e-3) - ) - self._test_basic_cases( - lambda weight, bias: optim.SGD( - self._build_params_dict_single(weight, bias, lr=1e-2)) - ) - self._test_basic_cases( - lambda weight, bias: optim.SGD([weight, bias], lr=1e-3), - [lambda opt: StepLR(opt, gamma=0.9, step_size=10)] - ) - self._test_basic_cases( - lambda weight, bias: optim.SGD([weight, bias], lr=1e-3), - [lambda opt: StepLR(opt, gamma=0.9, step_size=10), - lambda opt: ReduceLROnPlateau(opt)] - ) - self._test_basic_cases( - lambda weight, bias: optim.SGD([weight, bias], lr=1e-3), - [lambda opt: StepLR(opt, gamma=0.99, step_size=10), - lambda opt: ExponentialLR(opt, gamma=0.99), - lambda opt: ReduceLROnPlateau(opt)] - ) - with self.assertRaisesRegex(ValueError, "Invalid momentum value: -0.5"): - optim.SGD(None, lr=1e-2, momentum=-0.5) + for optimizer in [optim.SGD, optim_mt.SGD]: + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, lr=1e-2), + lr=1e-3) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict_single(weight, bias, lr=1e-2), + lr=1e-3) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict_single(weight, bias, lr=1e-2)) + ) + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3), + [lambda opt: StepLR(opt, gamma=0.9, step_size=10)] + ) + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3), + [lambda opt: StepLR(opt, gamma=0.9, step_size=10), + lambda opt: ReduceLROnPlateau(opt)] + ) + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3), + [lambda opt: StepLR(opt, gamma=0.99, step_size=10), + lambda opt: ExponentialLR(opt, gamma=0.99), + lambda opt: ReduceLROnPlateau(opt)] + ) + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3, momentum=1) + ) + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3, momentum=1, weight_decay=1) + ) + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], nesterov=True, lr=1e-3, momentum=1, weight_decay=1) + ) + with self.assertRaisesRegex(ValueError, "Invalid momentum value: -0.5"): + 
optimizer(None, lr=1e-2, momentum=-0.5) def test_sgd_sparse(self): - self._test_rosenbrock_sparse( - lambda params: optim.SGD(params, lr=5e-3) - ) - self._test_rosenbrock_sparse( - lambda params: optim.SGD(params, lr=0.005), - [lambda opt: StepLR(opt, gamma=0.99999, step_size=300)] - ) + for optimizer in [optim.SGD, optim_mt.SGD]: + self._test_rosenbrock_sparse( + lambda params: optimizer(params, lr=5e-3) + ) + self._test_rosenbrock_sparse( + lambda params: optimizer(params, lr=0.005), + [lambda opt: StepLR(opt, gamma=0.99999, step_size=300)] + ) - def test_adam(self): - self._test_basic_cases( - lambda weight, bias: optim.Adam([weight, bias], lr=1e-3) - ) - self._test_basic_cases( - lambda weight, bias: optim.Adam( - self._build_params_dict(weight, bias, lr=1e-2), - lr=1e-3) - ) - self._test_basic_cases( - lambda weight, bias: optim.Adam([weight, bias], lr=1e-3, - amsgrad=True) - ) - self._test_basic_cases( - lambda weight, bias: optim.Adam( - self._build_params_dict(weight, bias, lr=1e-2), - lr=1e-3, amsgrad=True) - ) - self._test_basic_cases( - lambda weight, bias: optim.Adam( - self._build_params_dict(weight, bias, lr=1e-2), - lr=1e-3), - [lambda opt: ExponentialLR(opt, gamma=0.9)] - ) - self._test_basic_cases( - lambda weight, bias: optim.Adam([weight, bias], lr=1e-3, - amsgrad=True), - [lambda opt: ExponentialLR(opt, gamma=0.9), - lambda opt: ReduceLROnPlateau(opt)] - ) - self._test_basic_cases( - lambda weight, bias: optim.Adam( - self._build_params_dict(weight, bias, lr=1e-2), - lr=1e-3, amsgrad=True), - [lambda opt: StepLR(opt, gamma=0.9, step_size=10), - lambda opt: ReduceLROnPlateau(opt)] - ) - with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 0: 1.0"): - optim.Adam(None, lr=1e-2, betas=(1.0, 0.0)) + @skipIfRocm + def test_multi_tensor_optimizers(self): + if not torch.cuda.is_available(): + return - with self.assertRaisesRegex(ValueError, "Invalid weight_decay value: -1"): - optim.Adam(None, lr=1e-2, weight_decay=-1) + optimizer_pairs_with_flags = [ + ((optim.Adam, optim._multi_tensor.Adam), dict(weight_decay=1., amsgrad=True)), + ((optim.Adam, optim._multi_tensor.Adam), dict(weight_decay=1., amsgrad=False)), + ((optim.Adam, optim._multi_tensor.Adam), dict(weight_decay=0., amsgrad=True)), + ((optim.Adam, optim._multi_tensor.Adam), dict(weight_decay=0., amsgrad=False)), + ((optim.AdamW, optim._multi_tensor.AdamW), dict(weight_decay=1., amsgrad=True)), + ((optim.AdamW, optim._multi_tensor.AdamW), dict(weight_decay=1., amsgrad=False)), + ((optim.AdamW, optim._multi_tensor.AdamW), dict(weight_decay=0., amsgrad=True)), + ((optim.AdamW, optim._multi_tensor.AdamW), dict(weight_decay=0., amsgrad=False)), + ((optim.SGD, optim._multi_tensor.SGD), dict(lr=0.2, momentum=1, dampening=0, weight_decay=1, nesterov=True)), + ((optim.SGD, optim._multi_tensor.SGD), dict(lr=0.2, momentum=1, dampening=0.5, weight_decay=1, nesterov=False)), + ((optim.RMSprop, optim._multi_tensor.RMSprop), dict(weight_decay=1, momentum=1, centered=True)), + ((optim.RMSprop, optim._multi_tensor.RMSprop), dict(weight_decay=1, momentum=0, centered=True)), + ((optim.RMSprop, optim._multi_tensor.RMSprop), dict(weight_decay=1, momentum=1, centered=False)), + ((optim.RMSprop, optim._multi_tensor.RMSprop), dict(weight_decay=0, momentum=1, centered=False)), + ((optim.Rprop, optim._multi_tensor.Rprop), dict(lr=1e-2, etas=(0.5, 1.2), step_sizes=(1e-6, 50))), + ((optim.ASGD, optim._multi_tensor.ASGD), dict(weight_decay=0)), + ((optim.ASGD, optim._multi_tensor.ASGD), dict(weight_decay=1)), + ((optim.Adamax, 
optim._multi_tensor.Adamax), dict(weight_decay=0)), + ((optim.Adamax, optim._multi_tensor.Adamax), dict(weight_decay=1)), + ((optim.Adadelta, optim._multi_tensor.Adadelta), dict(weight_decay=0)), + ((optim.Adadelta, optim._multi_tensor.Adadelta), dict(weight_decay=1)), + ] + + kIterations = 1001 + device = 'cuda' + + for optimizers, params in optimizer_pairs_with_flags: + res = [] + for opt in optimizers: + weight = torch.tensor([[-0.2109, -0.4976], [-0.1413, -0.3420], [-0.2524, 0.6976]], + dtype=torch.float64, device=device, requires_grad=True) + bias = torch.tensor([-0.1085, -0.2979, 0.6892], dtype=torch.float64, device=device, requires_grad=True) + weight2 = torch.tensor([[-0.0508, -0.3941, -0.2843]], + dtype=torch.float64, device=device, requires_grad=True) + bias2 = torch.tensor([-0.0711], dtype=torch.float64, device=device, requires_grad=True) + input = torch.tensor([0.1, 0.2, 0.3, 0.4, 0.5, 0.6], dtype=torch.float64, device=device).reshape(3, 2) + + model = torch.nn.Sequential(torch.nn.Linear(2, 3), + torch.nn.Sigmoid(), + torch.nn.Linear(3, 1), + torch.nn.Sigmoid()) + model.to(torch.float64).to(device) + + pretrained_dict = model.state_dict() + pretrained_dict['0.weight'] = weight + pretrained_dict['0.bias'] = bias + pretrained_dict['2.weight'] = weight2 + pretrained_dict['2.bias'] = bias2 + model.load_state_dict(pretrained_dict) + + optimizer = opt(model.parameters(), **params) + + for _ in range(kIterations): + optimizer.zero_grad() + output = model(input) + loss = output.sum() + loss.backward() + + if iter == 0: + model.parameters().__next__().grad = None + + optimizer.step() + + res.append(model.parameters()) + + for p1, p2 in zip(res[0], res[1]): + self.assertEqual(p1, p2) - def test_adamw(self): - self._test_basic_cases( - lambda weight, bias: optim.AdamW([weight, bias], lr=1e-3) - ) - self._test_basic_cases( - lambda weight, bias: optim.AdamW( - self._build_params_dict(weight, bias, lr=1e-2), - lr=1e-3) - ) - with self.assertRaisesRegex(ValueError, "Invalid weight_decay value: -1"): - optim.AdamW(None, lr=1e-2, weight_decay=-1) + def test_adam(self): + for optimizer in [optim.Adam, optim_mt.Adam]: + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, lr=1e-2), + lr=1e-3) + ) + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3, amsgrad=True) + ) + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3, weight_decay=0.1) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, lr=1e-2), + lr=1e-3, amsgrad=True) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, lr=1e-2), + lr=1e-3), + [lambda opt: ExponentialLR(opt, gamma=0.9)] + ) + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3, amsgrad=True), + [lambda opt: ExponentialLR(opt, gamma=0.9), + lambda opt: ReduceLROnPlateau(opt)] + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, lr=1e-2), + lr=1e-3, amsgrad=True), + [lambda opt: StepLR(opt, gamma=0.9, step_size=10), + lambda opt: ReduceLROnPlateau(opt)] + ) + with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 0: 1.0"): + optimizer(None, lr=1e-2, betas=(1.0, 0.0)) + + with self.assertRaisesRegex(ValueError, "Invalid weight_decay value: -1"): + optimizer(None, lr=1e-2, weight_decay=-1) + 
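For context, a minimal sketch of the numeric parity idea behind these paired single-tensor / _multi_tensor optimizer tests; the tensor shape, learning rate, and iteration count below are arbitrary illustrations, not values taken from the test:

    params_ref = [torch.randn(3, 2, dtype=torch.float64, requires_grad=True)]
    params_mt = [p.clone().detach().requires_grad_(True) for p in params_ref]
    opt_ref = optim.Adam(params_ref, lr=1e-3)
    opt_mt = optim_mt.Adam(params_mt, lr=1e-3)
    for _ in range(10):
        for ps, opt in ((params_ref, opt_ref), (params_mt, opt_mt)):
            opt.zero_grad()
            loss = sum((p * p).sum() for p in ps)
            loss.backward()
            opt.step()
    # After identical updates, the two implementations are expected to match numerically.
    assert torch.allclose(params_ref[0], params_mt[0])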
+ def test_adamw(self): + for optimizer in [optim.AdamW, optim_mt.AdamW]: + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, lr=1e-2), + lr=1e-3) + ) + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3, weight_decay=1) + ) + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3, weight_decay=1, amsgrad=True) + ) + with self.assertRaisesRegex(ValueError, "Invalid weight_decay value: -1"): + optimizer(None, lr=1e-2, weight_decay=-1) def test_sparse_adam(self): self._test_rosenbrock_sparse( @@ -365,21 +460,25 @@ def test_sparse_adam(self): # ROCm precision is too low to pass this test @skipIfRocm def test_adadelta(self): - self._test_basic_cases( - lambda weight, bias: optim.Adadelta([weight, bias]) - ) - self._test_basic_cases( - lambda weight, bias: optim.Adadelta( - self._build_params_dict(weight, bias, rho=0.95)) - ) - self._test_basic_cases( - lambda weight, bias: optim.Adadelta( - self._build_params_dict(weight, bias, rho=0.95)), - [lambda opt: StepLR(opt, gamma=0.9, step_size=10), - lambda opt: ReduceLROnPlateau(opt)] - ) - with self.assertRaisesRegex(ValueError, "Invalid rho value: 1.1"): - optim.Adadelta(None, lr=1e-2, rho=1.1) + for optimizer in [optim.Adadelta, optim_mt.Adadelta]: + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias]) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, rho=0.95)) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, rho=0.95)), + [lambda opt: StepLR(opt, gamma=0.9, step_size=10), + lambda opt: ReduceLROnPlateau(opt)] + ) + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], weight_decay=1) + ) + with self.assertRaisesRegex(ValueError, "Invalid rho value: 1.1"): + optimizer(None, lr=1e-2, rho=1.1) def test_adagrad(self): self._test_basic_cases( @@ -421,52 +520,84 @@ def test_adagrad_sparse(self): ) def test_adamax(self): - self._test_basic_cases( - lambda weight, bias: optim.Adamax([weight, bias], lr=1e-1) - ) - self._test_basic_cases( - lambda weight, bias: optim.Adamax( - self._build_params_dict(weight, bias, lr=1e-2), - lr=1e-1) - ) - with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 1: 1.0"): - optim.Adamax(None, lr=1e-2, betas=(0.0, 1.0)) + for optimizer in [optim.Adamax, optim_mt.Adamax]: + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-1) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, lr=1e-2), + lr=1e-1) + ) + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-1, weight_decay=1) + ) + with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 1: 1.0"): + optimizer(None, lr=1e-2, betas=(0.0, 1.0)) def test_rmsprop(self): - self._test_basic_cases( - lambda weight, bias: optim.RMSprop([weight, bias], lr=1e-2) - ) - self._test_basic_cases( - lambda weight, bias: optim.RMSprop( - self._build_params_dict(weight, bias, lr=1e-3), - lr=1e-2) - ) - with self.assertRaisesRegex(ValueError, "Invalid momentum value: -1.0"): - optim.RMSprop(None, lr=1e-2, momentum=-1.0) + for optimizer in [optim.RMSprop, optim_mt.RMSprop]: + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-2) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + 
self._build_params_dict(weight, bias, lr=1e-3), + lr=1e-2) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, lr=1e-3), + lr=1e-2, centered=True) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, lr=1e-3), + lr=1e-2, centered=True, momentum=0.1) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, lr=1e-3), + lr=1e-2, momentum=0.1) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, lr=1e-3), + lr=1e-2, momentum=0.1, weight_decay=1) + ) + with self.assertRaisesRegex(ValueError, "Invalid momentum value: -1.0"): + optimizer(None, lr=1e-2, momentum=-1.0) def test_asgd(self): - self._test_basic_cases( - lambda weight, bias: optim.ASGD([weight, bias], lr=1e-3, t0=100) - ) - self._test_basic_cases( - lambda weight, bias: optim.ASGD( - self._build_params_dict(weight, bias, lr=1e-2), - lr=1e-3, t0=100) - ) - with self.assertRaisesRegex(ValueError, "Invalid weight_decay value: -0.5"): - optim.ASGD(None, lr=1e-2, weight_decay=-0.5) + for optimizer in [optim.ASGD, optim_mt.ASGD]: + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3, t0=100) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, lr=1e-2), + lr=1e-3, t0=100) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, lr=1e-3), + lr=1e-2, weight_decay=1) + ) + with self.assertRaisesRegex(ValueError, "Invalid weight_decay value: -0.5"): + optimizer(None, lr=1e-2, weight_decay=-0.5) def test_rprop(self): - self._test_basic_cases( - lambda weight, bias: optim.Rprop([weight, bias], lr=1e-3) - ) - self._test_basic_cases( - lambda weight, bias: optim.Rprop( - self._build_params_dict(weight, bias, lr=1e-2), - lr=1e-3) - ) - with self.assertRaisesRegex(ValueError, "Invalid eta values: 1.0, 0.5"): - optim.Rprop(None, lr=1e-2, etas=(1.0, 0.5)) + for optimizer in [optim.Rprop, optim_mt.Rprop]: + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, lr=1e-2), + lr=1e-3) + ) + with self.assertRaisesRegex(ValueError, "Invalid eta values: 1.0, 0.5"): + optimizer(None, lr=1e-2, etas=(1.0, 0.5)) def test_lbfgs(self): self._test_basic_cases( diff --git a/test/test_package.py b/test/test_package.py new file mode 100644 index 000000000000..37d7b0f385a2 --- /dev/null +++ b/test/test_package.py @@ -0,0 +1,315 @@ +from unittest import main, skipIf +from torch.testing._internal.common_utils import TestCase, IS_WINDOWS +from tempfile import NamedTemporaryFile +from torch.package import PackageExporter, PackageImporter +from pathlib import Path +from tempfile import TemporaryDirectory +import torch +from sys import version_info +from io import StringIO + +try: + from torchvision.models import resnet18 + HAS_TORCHVISION = True +except ImportError: + HAS_TORCHVISION = False +skipIfNoTorchVision = skipIf(not HAS_TORCHVISION, "no torchvision") + + + +packaging_directory = Path(__file__).parent + +class PackagingTest(TestCase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._temporary_files = [] + + def temp(self): + t = NamedTemporaryFile() + name = t.name + if IS_WINDOWS: + t.close() # can't read an open file in windows + else: + self._temporary_files.append(t) + return name 
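For orientation, a minimal sketch of the export/import round trip this new test module exercises; the archive path is a hypothetical placeholder, and the calls mirror those used in the tests below:

    with PackageExporter('/tmp/example_package.zip', verbose=False) as exporter:
        exporter.save_source_string('my_mod', 'result = "hello"\n')
    importer = PackageImporter('/tmp/example_package.zip')
    assert importer.import_module('my_mod').result == 'hello'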
+ + def tearDown(self): + for t in self._temporary_files: + t.close() + self._temporary_files = [] + + def test_saving_source(self): + filename = self.temp() + with PackageExporter(filename, verbose=False) as he: + he.save_source_file('foo', str(packaging_directory / 'module_a.py')) + he.save_source_file('foodir', str(packaging_directory / 'package_a')) + hi = PackageImporter(filename) + foo = hi.import_module('foo') + s = hi.import_module('foodir.subpackage') + self.assertEqual(foo.result, 'module_a') + self.assertEqual(s.result, 'package_a.subpackage') + + def test_saving_string(self): + filename = self.temp() + with PackageExporter(filename, verbose=False) as he: + src = """\ +import math +the_math = math +""" + he.save_source_string('my_mod', src) + hi = PackageImporter(filename) + m = hi.import_module('math') + import math + self.assertIs(m, math) + my_mod = hi.import_module('my_mod') + self.assertIs(my_mod.math, math) + + def test_save_module(self): + filename = self.temp() + with PackageExporter(filename, verbose=False) as he: + import module_a + import package_a + he.save_module(module_a.__name__) + he.save_module(package_a.__name__) + hi = PackageImporter(filename) + module_a_i = hi.import_module('module_a') + self.assertEqual(module_a_i.result, 'module_a') + self.assertIsNot(module_a, module_a_i) + package_a_i = hi.import_module('package_a') + self.assertEqual(package_a_i.result, 'package_a') + self.assertIsNot(package_a_i, package_a) + + def test_pickle(self): + import package_a.subpackage + obj = package_a.subpackage.PackageASubpackageObject() + obj2 = package_a.PackageAObject(obj) + + filename = self.temp() + with PackageExporter(filename, verbose=False) as he: + he.save_pickle('obj', 'obj.pkl', obj2) + hi = PackageImporter(filename) + + # check we got dependencies + sp = hi.import_module('package_a.subpackage') + # check we didn't get other stuff + with self.assertRaises(ImportError): + hi.import_module('module_a') + + obj_loaded = hi.load_pickle('obj', 'obj.pkl') + self.assertIsNot(obj2, obj_loaded) + self.assertIsInstance(obj_loaded.obj, sp.PackageASubpackageObject) + self.assertIsNot(package_a.subpackage.PackageASubpackageObject, sp.PackageASubpackageObject) + + def test_resources(self): + filename = self.temp() + with PackageExporter(filename, verbose=False) as he: + he.save_text('main', 'main', "my string") + he.save_binary('main', 'main_binary', "my string".encode('utf-8')) + src = """\ +import resources +t = resources.load_text('main', 'main') +b = resources.load_binary('main', 'main_binary') +""" + he.save_source_string('main', src, is_package=True) + hi = PackageImporter(filename) + m = hi.import_module('main') + self.assertEqual(m.t, "my string") + self.assertEqual(m.b, "my string".encode('utf-8')) + + def test_extern(self): + filename = self.temp() + with PackageExporter(filename, verbose=False) as he: + he.extern_modules(['package_a.subpackage', 'module_a']) + he.save_module('package_a') + hi = PackageImporter(filename) + import package_a.subpackage + import module_a + + module_a_im = hi.import_module('module_a') + hi.import_module('package_a.subpackage') + package_a_im = hi.import_module('package_a') + + self.assertIs(module_a, module_a_im) + self.assertIsNot(package_a, package_a_im) + self.assertIs(package_a.subpackage, package_a_im.subpackage) + + @skipIf(version_info.major < 3 or version_info.minor < 7, 'mock uses __getattr__ a 3.7 feature') + def test_mock(self): + filename = self.temp() + with PackageExporter(filename, verbose=False) as he: + 
he.mock_modules(['package_a.subpackage', 'module_a']) + he.save_module('package_a') + hi = PackageImporter(filename) + import package_a.subpackage + _ = package_a.subpackage + import module_a + _ = module_a + + m = hi.import_module('package_a.subpackage') + r = m.result + with self.assertRaisesRegex(NotImplementedError, 'was mocked out'): + r() + + @skipIf(version_info.major < 3 or version_info.minor < 7, 'mock uses __getattr__ a 3.7 feature') + def test_custom_requires(self): + filename = self.temp() + + class Custom(PackageExporter): + def require_module(self, name, dependencies): + if name == 'module_a': + self.mock_module('module_a') + elif name == 'package_a': + self.save_source_string('package_a', 'import module_a\nresult = 5\n') + else: + raise NotImplementedError('wat') + + with Custom(filename, verbose=False) as he: + he.save_source_string('main', 'import package_a\n') + + hi = PackageImporter(filename) + hi.import_module('module_a').should_be_mocked + bar = hi.import_module('package_a') + self.assertEqual(bar.result, 5) + + @skipIfNoTorchVision + def test_resnet(self): + resnet = resnet18() + + f1 = self.temp() + + # create a package that will save it along with its code + with PackageExporter(f1, verbose=False) as e: + # put the pickled resnet in the package, by default + # this will also save all the code files references by + # the objects in the pickle + e.save_pickle('model', 'model.pkl', resnet) + + # check th debug graph has something reasonable: + buf = StringIO() + e._write_dep_graph(failing_module='torch', output_file=buf) + self.assertIn('torchvision.models.resnet', buf.getvalue()) + + # we can now load the saved model + i = PackageImporter(f1) + r2 = i.load_pickle('model', 'model.pkl') + + # test that it works + input = torch.rand(1, 3, 224, 224) + ref = resnet(input) + self.assertTrue(torch.allclose(r2(input), ref)) + + # functions exist also to get at the private modules in each package + torchvision = i.import_module('torchvision') + + f2 = self.temp() + # if we are doing transfer learning we might want to re-save + # things that were loaded from a package + with PackageExporter(f2, verbose=False) as e: + # We need to tell the exporter about any modules that + # came from imported packages so that it can resolve + # class names like torchvision.models.resnet.ResNet + # to their source code. + + e.importers.insert(0, i.import_module) + + # e.importers is a list of module importing functions + # that by default contains importlib.import_module. + # it is searched in order until the first success and + # that module is taken to be what torchvision.models.resnet + # should be in this code package. In the case of name collisions, + # such as trying to save a ResNet from two different packages, + # we take the first thing found in the path, so only ResNet objects from + # one importer will work. This avoids a bunch of name mangling in + # the source code. If you need to actually mix ResNet objects, + # we suggest reconstructing the model objects using code from a single package + # using functions like save_state_dict and load_state_dict to transfer state + # to the correct code objects. 
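            # A sketch of the state_dict-based alternative mentioned above (assumption: the
            # weights are rehosted onto a ResNet built from one code package; `fresh` is a
            # hypothetical local name and is unused by the rest of this test).
            fresh = resnet18()
            fresh.load_state_dict(r2.state_dict())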
+            e.save_pickle('model', 'model.pkl', r2)
+
+        i2 = PackageImporter(f2)
+        r3 = i2.load_pickle('model', 'model.pkl')
+        self.assertTrue(torch.allclose(r3(input), ref))
+
+        # test we can load from a directory
+        import zipfile
+        zf = zipfile.ZipFile(f1, 'r')
+
+        with TemporaryDirectory() as td:
+            zf.extractall(path=td)
+            iz = PackageImporter(str(Path(td) / Path(f1).name))
+            r4 = iz.load_pickle('model', 'model.pkl')
+            self.assertTrue(torch.allclose(r4(input), ref))
+
+    @skipIfNoTorchVision
+    def test_model_save(self):
+
+        # This example shows how you might package a model
+        # so that the creator of the model has flexibility about
+        # how they want to save it but the 'server' can always
+        # use the same API to load the package.
+
+        # The convention is for each model to provide a
+        # 'model' package with a 'load' function that actually
+        # reads the model out of the archive.
+
+        # How the load function is implemented is up to
+        # the packager.
+
+        # get our normal torchvision resnet
+        resnet = resnet18()
+
+
+        f1 = self.temp()
+        # Option 1: save by pickling the whole model
+        # + single-line, similar to torch.jit.save
+        # - more difficult to edit the code after the model is created
+        with PackageExporter(f1, verbose=False) as e:
+            e.save_pickle('model', 'pickled', resnet)
+            # note that this source is the same for all models in this approach
+            # so it can be made part of an API that just takes the model and
+            # packages it with this source.
+            src = """\
+import resources # gives you access to the importer from within the package
+
+# server knows to call model.load() to get the model,
+# maybe in the future it passes options as arguments by convention
+def load():
+    return resources.load_pickle('model', 'pickled')
+    """
+            e.save_source_string('model', src, is_package=True)
+
+        f2 = self.temp()
+        # Option 2: save with state dict
+        # - more code to write to save/load the model
+        # + but this code can be edited later to adapt the model
+        with PackageExporter(f2, verbose=False) as e:
+            e.save_pickle('model', 'state_dict', resnet.state_dict())
+            src = """\
+import resources # gives you access to the importer from within the package
+from torchvision.models.resnet import resnet18
+def load():
+    # if you want, you can later edit how resnet is constructed here
+    # to edit the model in the package, while still loading the original
+    # state dict weights
+    r = resnet18()
+    state_dict = resources.load_pickle('model', 'state_dict')
+    r.load_state_dict(state_dict)
+    return r
+    """
+            e.save_source_string('model', src, is_package=True)
+
+
+
+        # regardless of how we chose to package, we can now use the model in a server in the same way
+        input = torch.rand(1, 3, 224, 224)
+        results = []
+        for m in [f1, f2]:
+            importer = PackageImporter(m)
+            the_model = importer.import_module('model').load()
+            r = the_model(input)
+            results.append(r)
+
+        self.assertTrue(torch.allclose(*results))
+
+if __name__ == '__main__':
+    main()
diff --git a/test/test_profiler.py b/test/test_profiler.py
index aefdfbb937fa..f1feff1d0af3 100644
--- a/test/test_profiler.py
+++ b/test/test_profiler.py
@@ -3,6 +3,7 @@
 import unittest
 
 import torch
+import torch.nn as nn
 from torch.testing._internal.common_utils import (
     TestCase, run_tests, TEST_WITH_ASAN, IS_WINDOWS)
 from torch.autograd.profiler import profile
@@ -18,7 +19,7 @@
 @unittest.skipIf(TEST_WITH_ASAN, "Cannot test with ASAN")
 @unittest.skipIf(IS_WINDOWS, "Test is flaky on Windows")
 @unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
-class 
TestProfiler_cuda(TestCase): +class TestProfilerCUDA(TestCase): def test_mem_leak(self): """Checks that there's no memory leak when using profiler with CUDA """ @@ -44,5 +45,60 @@ def test_mem_leak(self): self.assertTrue(not (is_increasing and max_diff > 100 * 1024), msg='memory usage is increasing, {}'.format(str(last_rss))) +class TestProfiler(TestCase): + def test_source(self): + """Checks that source code attribution works for eager, TS and autograd mode + """ + # avoid automatic inlining + prev_opt = torch._C._get_graph_executor_optimize() + torch._C._set_graph_executor_optimize(False) + + @torch.jit.script + def ts_method_2(x, y): + return torch.matmul(x, y) + + @torch.jit.script + def ts_method_1(x, y, z): + a = x + z + w = ts_method_2(x, y) + a + return w.sum() + + class DummyModule(nn.Module): + def __init__(self): + super(DummyModule, self).__init__() + self.conv = torch.nn.Conv2d(3, 2, kernel_size=1, stride=2, padding=3, bias=False) + + def forward(self, x): + return self.conv(x) + + mod = DummyModule() + + with profile(with_stack=True) as p: + x = torch.randn(10, 10, requires_grad=True) + y = torch.randn(10, 10, requires_grad=True) + z = x + y + w = ts_method_1(x, y, z) + v = 2 * w + v.backward() + a = torch.randn(2, 3, 2, 2, requires_grad=True) + b = mod(a) + c = b.sum() + c.backward() + + print(p.key_averages( + group_by_stack_n=5).table( + sort_by="self_cpu_time_total", row_limit=-1)) + + for e in p.function_events: + if "aten::add" in e.name or "AddBackward" in e.name: + self.assertTrue(any(["test_profiler" in entry for entry in e.stack])) + self.assertTrue(any([( + "test_source" in entry or + "ts_method_1" in entry or + "ts_method_2" in entry) for entry in e.stack])) + + torch._C._set_graph_executor_optimize(prev_opt) + + if __name__ == '__main__': run_tests() diff --git a/test/test_sparse.py b/test/test_sparse.py index 6ecfa10c05bf..9425ca2b2a8b 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -6,6 +6,7 @@ import itertools import functools +import operator import random import unittest from torch.testing._internal.common_utils import TestCase, run_tests, skipIfRocm, do_test_dtypes, \ @@ -1727,53 +1728,182 @@ def test_narrow(self): self.assertRaises(RuntimeError, lambda: with_dense.narrow_copy(10, 0, 3)) # dim > sparseDim + denseDim - def _test_log1p_tensor(self, input, dense_tensor): + def _test_log1p_tensor(self, sparse_tensor): + dense_tensor = sparse_tensor.to_dense() expected_output = dense_tensor.log1p() - self.assertEqual(expected_output, input.log1p().to_dense()) - self.assertEqual(expected_output, input.coalesce().log1p_().to_dense()) - # test in-place op on uncoalesced input - with self.assertRaisesRegex(RuntimeError, "in-place on uncoalesced tensors is not supported yet"): - input.log1p_() + self.assertEqual(expected_output, sparse_tensor.log1p().to_dense()) + self.assertEqual(expected_output, sparse_tensor.coalesce().log1p_().to_dense()) - input.requires_grad_() - self.assertTrue(input.requires_grad) + if self.is_uncoalesced: + # test in-place op on uncoalesced input + with self.assertRaisesRegex(RuntimeError, "in-place on uncoalesced tensors is not supported"): + sparse_tensor.log1p_() + + sparse_tensor.requires_grad_() + self.assertTrue(sparse_tensor.requires_grad) # test autograd - x = input.clone() - y = input.log1p() + x = sparse_tensor.clone() + y = sparse_tensor.log1p() with self.assertRaisesRegex(RuntimeError, "log1p of a sparse tensor is made to be non-differentiable"): y.backward(x) def test_log1p(self): - input = torch.sparse_coo_tensor( 
- torch.LongTensor([[0], [1], [2]]).transpose(1, 0).clone().detach(), - torch.FloatTensor([3, 4, 5]), - torch.Size([3]), - device=self.device) - self._test_log1p_tensor(input, torch.as_tensor([3, 4, 5], dtype=torch.float32)) - - # test uncoalesced input - input_uncoalesced = torch.sparse_coo_tensor( - torch.LongTensor([[0], [1], [2], [0], [1], [2]]).transpose(1, 0).clone().detach(), - torch.FloatTensor([2, 3, 4, 1, 1, 1]), - torch.Size([3]), - device=self.device) - self._test_log1p_tensor(input_uncoalesced, torch.as_tensor([3, 4, 5], dtype=torch.float32)) - - input = torch.sparse_coo_tensor( - torch.zeros([2, 0]), - torch.zeros([0, 5, 5, 5, 5, 5, 5, 0]), - torch.Size([0, 0, 5, 5, 5, 5, 5, 5, 0]), - device=self.device) - self._test_log1p_tensor(input, torch.zeros([0, 0, 5, 5, 5, 5, 5, 5, 0])) - - input = torch.sparse_coo_tensor( - torch.zeros([1, 5]), - torch.zeros([5, 6, 0]), - torch.Size([5, 6, 0]), - device=self.device) - self._test_log1p_tensor(input, torch.zeros([5, 6, 0])) + if not self.is_uncoalesced: + input_coalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[0], [1], [2]]).transpose(1, 0), + values=torch.tensor([3.0, 4.0, 5.0]), + size=[3, ], + device=self.device + ).coalesce() + self._test_log1p_tensor(input_coalesced) + + # hybrid sparse input + input_coalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[1, 3], [2, 4]]), + values=torch.tensor([[1.0, 3.0], [5.0, 7.0]]), + size=[4, 5, 2], + device=self.device + ).coalesce() + self._test_log1p_tensor(input_coalesced) + + if self.is_uncoalesced: + # test uncoalesced input + input_uncoalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[0], [1], [2], [0], [1], [2]]).transpose(1, 0), + values=torch.tensor([2.0, 3.0, 4.0, 1.0, 1.0, 1.0]), + size=[3, ], + device=self.device + ) + self._test_log1p_tensor(input_uncoalesced) + + # test on empty sparse tensor + input_uncoalesced = torch.sparse_coo_tensor( + indices=torch.zeros([2, 0]), + values=torch.zeros([0, 5, 5, 5, 5, 5, 5, 0]), + size=[0, 0, 5, 5, 5, 5, 5, 5, 0], + device=self.device + ) + self._test_log1p_tensor(input_uncoalesced) + + def _test_neg_negative(self, sparse_tensor): + dense_tensor = sparse_tensor.to_dense() + expected_output = dense_tensor.neg() + + ops = ( + torch.neg, torch.Tensor.neg, torch.Tensor.neg_, + torch.negative, torch.Tensor.negative, torch.Tensor.negative_, + operator.neg + ) + for op in ops: + sparse_tensor_copy = sparse_tensor.clone() + self.assertEqual(expected_output, op(sparse_tensor_copy).to_dense()) + + if op in (torch.neg, torch.negative): + sparse_tensor_out = torch.zeros_like(sparse_tensor) + op(sparse_tensor, out=sparse_tensor_out) + self.assertEqual(expected_output, sparse_tensor_out.to_dense()) + + def test_neg_negative(self): + + if not self.is_uncoalesced: + input_coalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[0, 1, 2]]), + values=torch.tensor([3.0, -4.0, 5.0]), + size=[3, ], + device=self.device + ).coalesce() + self._test_neg_negative(input_coalesced) + + # hybrid sparse input + input_coalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[1, 3], [2, 4]]), + values=torch.tensor([[-1.0, 3.0], [-5.0, 7.0]]), + size=[4, 5, 2], + device=self.device + ).coalesce() + self._test_neg_negative(input_coalesced) + + if self.is_uncoalesced: + # test uncoalesced input + input_uncoalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[0], [1], [2], [0], [1], [2]]).transpose(1, 0), + values=torch.tensor([2.0, -3.0, -4.0, 1.0, -1.0, 1.5]), + size=[3, ], + device=self.device + ) + 
self._test_neg_negative(input_uncoalesced) + + # test on empty sparse tensor + input_uncoalesced = torch.sparse_coo_tensor( + indices=torch.zeros([2, 0]), + values=torch.zeros([0, 5, 5, 5, 5, 5, 5, 0]), + size=[0, 0, 5, 5, 5, 5, 5, 5, 0], + device=self.device + ) + self._test_neg_negative(input_uncoalesced) + + def _test_asin_arcsin(self, sparse_tensor): + dense_tensor = sparse_tensor.to_dense() + expected_output = dense_tensor.asin() + + ops = ( + torch.asin, torch.Tensor.asin, + torch.arcsin, torch.Tensor.arcsin, + ) + for op in ops: + self.assertEqual(expected_output, op(sparse_tensor).to_dense()) + if op in (torch.asin, torch.arcsin): + sparse_tensor_out = torch.zeros_like(sparse_tensor) + op(sparse_tensor, out=sparse_tensor_out) + self.assertEqual(expected_output, sparse_tensor_out.to_dense()) + + for op in (torch.Tensor.asin_, torch.Tensor.arcsin_): + self.assertEqual(expected_output, op(sparse_tensor.clone().coalesce()).to_dense()) + if self.is_uncoalesced: + # test in-place op on uncoalesced input + with self.assertRaisesRegex(RuntimeError, "in-place on uncoalesced tensors is not supported"): + op(sparse_tensor) + + def test_asin_arcsin(self): + + if not self.is_uncoalesced: + input_coalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[0, 1, 2, 3]]), + values=torch.tensor([0.5, -0.5, 0.7, -0.7]), + size=[4, ], + device=self.device + ).coalesce() + self._test_asin_arcsin(input_coalesced) + + # hybrid sparse input + input_coalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[1, 3], [2, 4]]), + values=torch.tensor([[-0.1, 0.24], [-0.44, 0.1]]), + size=[4, 5, 2], + device=self.device + ).coalesce() + self._test_asin_arcsin(input_coalesced) + + if self.is_uncoalesced: + # test uncoalesced input + input_uncoalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[0], [1], [2], [0], [1], [2]]).transpose(1, 0), + values=torch.tensor([0.3, -0.3, -0.4, 0.3, -0.5, 0.15]), + size=[3, ], + device=self.device + ) + self._test_asin_arcsin(input_uncoalesced) + + # test on empty sparse tensor + input_uncoalesced = torch.sparse_coo_tensor( + indices=torch.zeros([2, 0]), + values=torch.zeros([0, 5, 5, 5, 5, 5, 5, 0]), + size=[0, 0, 5, 5, 5, 5, 5, 5, 0], + device=self.device + ) + self._test_asin_arcsin(input_uncoalesced) def test_mv(self): def test_shape(di, dj, dk, nnz): @@ -2458,7 +2588,7 @@ def test_sparse_to_numpy(self): t = torch.sparse_coo_tensor(torch.tensor(([0, 0], [2, 0])), torch.tensor([1, 4])) self.assertRaises(TypeError, lambda: t.numpy()) - @cpu_only + @skipIfRocm def test_softmax(self): import torch.nn.functional as F diff --git a/test/test_spectral_ops.py b/test/test_spectral_ops.py index 59b58fa202d6..82ed2225bda8 100644 --- a/test/test_spectral_ops.py +++ b/test/test_spectral_ops.py @@ -3,6 +3,7 @@ import math from contextlib import contextmanager from itertools import product +import itertools from torch.testing._internal.common_utils import \ (TestCase, run_tests, TEST_NUMPY, TEST_LIBROSA) @@ -11,7 +12,7 @@ skipCPUIfNoMkl, skipCUDAIfRocm, deviceCountAtLeast, onlyCUDA) from distutils.version import LooseVersion -from typing import Optional +from typing import Optional, List if TEST_NUMPY: @@ -115,6 +116,7 @@ def method_fn(t): @skipCPUIfNoMkl @skipCUDAIfRocm + @onlyOnCPUAndCUDA @unittest.skipIf(not TEST_NUMPY, 'NumPy not found') @precisionOverride({torch.complex64: 1e-4, torch.float: 1e-4}) @dtypes(torch.float, torch.double, torch.complex64, torch.complex128) @@ -226,11 +228,13 @@ def test_fft_round_trip(self, device, dtype): def test_empty_fft(self, device, 
dtype): t = torch.empty(0, device=device, dtype=dtype) match = r"Invalid number of data points \([-\d]*\) specified" - fft_functions = [torch.fft.fft, torch.fft.ifft, torch.fft.hfft, - torch.fft.irfft] + fft_functions = [torch.fft.fft, torch.fft.fftn, + torch.fft.ifft, torch.fft.ifftn, + torch.fft.irfft, torch.fft.irfftn, + torch.fft.hfft] # Real-only functions if not dtype.is_complex: - fft_functions += [torch.fft.rfft, torch.fft.ihfft] + fft_functions += [torch.fft.rfft, torch.fft.rfftn, torch.fft.ihfft] for fn in fft_functions: with self.assertRaisesRegex(RuntimeError, match): @@ -242,6 +246,9 @@ def test_fft_invalid_dtypes(self, device): with self.assertRaisesRegex(RuntimeError, "Expected a real input tensor"): torch.fft.rfft(t) + with self.assertRaisesRegex(RuntimeError, "Expected a real input tensor"): + torch.fft.rfftn(t) + with self.assertRaisesRegex(RuntimeError, "Expected a real input tensor"): torch.fft.ihfft(t) @@ -292,7 +299,9 @@ def test_fft_half_errors(self, device, dtype): # TODO: Remove torch.half error when complex32 is fully implemented x = torch.randn(64, device=device).to(dtype) fft_functions = (torch.fft.fft, torch.fft.ifft, + torch.fft.fftn, torch.fft.ifftn, torch.fft.rfft, torch.fft.irfft, + torch.fft.rfftn, torch.fft.irfftn, torch.fft.hfft, torch.fft.ihfft) for fn in fft_functions: with self.assertRaisesRegex(RuntimeError, "Unsupported dtype "): @@ -300,6 +309,7 @@ def test_fft_half_errors(self, device, dtype): @skipCPUIfNoMkl @skipCUDAIfRocm + @onlyOnCPUAndCUDA @dtypes(torch.double, torch.complex128) # gradcheck requires double def test_fft_backward(self, device, dtype): test_args = list(product( @@ -340,6 +350,241 @@ def test_fn(x): self.assertTrue(torch.autograd.gradcheck(test_fn, (input,))) + # nd-fft tests + + @skipCPUIfNoMkl + @skipCUDAIfRocm + @onlyOnCPUAndCUDA + @unittest.skipIf(not TEST_NUMPY, 'NumPy not found') + @precisionOverride({torch.complex64: 1e-4, torch.float: 1e-4}) + @dtypes(torch.float, torch.double, torch.complex64, torch.complex128) + def test_fftn_numpy(self, device, dtype): + norm_modes = ((None, "forward", "backward", "ortho") + if LooseVersion(np.__version__) >= '1.20.0' + else (None, "ortho")) + + # input_ndim, s, dim + transform_desc = [ + *product(range(2, 5), (None,), (None, (0,), (0, -1))), + *product(range(2, 5), (None, (4, 10)), (None,)), + (6, None, None), + (5, None, (1, 3, 4)), + (3, None, (0, -1)), + (3, None, (1,)), + (1, None, (0,)), + (4, (10, 10), None), + (4, (10, 10), (0, 1)) + ] + + fft_functions = ['fftn', 'ifftn', 'irfftn'] + # Real-only functions + if not dtype.is_complex: + fft_functions += ['rfftn'] + + for input_ndim, s, dim in transform_desc: + shape = itertools.islice(itertools.cycle(range(4, 9)), input_ndim) + input = torch.randn(*shape, device=device, dtype=dtype) + for fname, norm in product(fft_functions, norm_modes): + torch_fn = getattr(torch.fft, fname) + numpy_fn = getattr(np.fft, fname) + + def fn(t: torch.Tensor, s: Optional[List[int]], dim: Optional[List[int]], norm: Optional[str]): + return torch_fn(t, s, dim, norm) + + torch_fns = (torch_fn, torch.jit.script(fn)) + + expected = numpy_fn(input.cpu().numpy(), s, dim, norm) + exact_dtype = dtype in (torch.double, torch.complex128) + for fn in torch_fns: + actual = fn(input, s, dim, norm) + self.assertEqual(actual, expected, exact_dtype=exact_dtype) + + @skipCUDAIfRocm + @skipCPUIfNoMkl + @onlyOnCPUAndCUDA + @dtypes(torch.float, torch.double, torch.complex64, torch.complex128) + def test_fftn_round_trip(self, device, dtype): + norm_modes = (None, 
"forward", "backward", "ortho") + + # input_ndim, dim + transform_desc = [ + *product(range(2, 5), (None, (0,), (0, -1))), + *product(range(2, 5), (None,)), + (7, None), + (5, (1, 3, 4)), + (3, (0, -1)), + (3, (1,)), + (1, 0), + ] + + fft_functions = [(torch.fft.fftn, torch.fft.ifftn)] + + # Real-only functions + if not dtype.is_complex: + fft_functions += [(torch.fft.rfftn, torch.fft.irfftn)] + + for input_ndim, dim in transform_desc: + shape = itertools.islice(itertools.cycle(range(4, 9)), input_ndim) + x = torch.randn(*shape, device=device, dtype=dtype) + + for (forward, backward), norm in product(fft_functions, norm_modes): + if isinstance(dim, tuple): + s = [x.size(d) for d in dim] + else: + s = x.size() if dim is None else x.size(dim) + + kwargs = {'s': s, 'dim': dim, 'norm': norm} + y = backward(forward(x, **kwargs), **kwargs) + # For real input, ifftn(fftn(x)) will convert to complex + self.assertEqual(x, y, exact_dtype=( + forward != torch.fft.fftn or x.is_complex())) + + @skipCPUIfNoMkl + @skipCUDAIfRocm + @onlyOnCPUAndCUDA + @dtypes(torch.double, torch.complex128) # gradcheck requires double + def test_fftn_backward(self, device, dtype): + # input_ndim, s, dim + transform_desc = [ + *product((2, 3), (None,), (None, (0,), (0, -1))), + *product((2, 3), (None, (4, 10)), (None,)), + (4, None, None), + (3, (10, 10), (0, 1)), + (2, (1, 1), (0, 1)), + (2, None, (1,)), + (1, None, (0,)), + (1, (11,), (0,)), + ] + norm_modes = (None, "forward", "backward", "ortho") + + fft_functions = ['fftn', 'ifftn', 'irfftn'] + # Real-only functions + if not dtype.is_complex: + fft_functions += ['rfftn'] + + for input_ndim, s, dim in transform_desc: + shape = itertools.islice(itertools.cycle(range(4, 9)), input_ndim) + input = torch.randn(*shape, device=device, dtype=dtype) + + for fname, norm in product(fft_functions, norm_modes): + torch_fn = getattr(torch.fft, fname) + + # Workaround for gradcheck's poor support for complex input + # Use real input instead and put view_as_complex into the graph + if dtype.is_complex: + def test_fn(x): + return torch_fn(torch.view_as_complex(x), s, dim, norm) + inputs = (torch.view_as_real(input).detach().requires_grad_(),) + else: + def test_fn(x): + return torch_fn(x, s, dim, norm) + inputs = (input.detach().requires_grad_(),) + + self.assertTrue(torch.autograd.gradcheck(test_fn, inputs)) + + @skipCUDAIfRocm + @skipCPUIfNoMkl + @onlyOnCPUAndCUDA + def test_fftn_invalid(self, device): + a = torch.rand(10, 10, 10, device=device) + fft_funcs = (torch.fft.fftn, torch.fft.ifftn, + torch.fft.rfftn, torch.fft.irfftn) + + for func in fft_funcs: + with self.assertRaisesRegex(RuntimeError, "FFT dims must be unique"): + func(a, dim=(0, 1, 0)) + + with self.assertRaisesRegex(RuntimeError, "FFT dims must be unique"): + func(a, dim=(2, -1)) + + with self.assertRaisesRegex(RuntimeError, "dim and shape .* same length"): + func(a, s=(1,), dim=(0, 1)) + + with self.assertRaisesRegex(IndexError, "Dimension out of range"): + func(a, dim=(3,)) + + with self.assertRaisesRegex(RuntimeError, "tensor only has 3 dimensions"): + func(a, s=(10, 10, 10, 10)) + + c = torch.complex(a, a) + with self.assertRaisesRegex(RuntimeError, "Expected a real input"): + torch.fft.rfftn(c) + + # Helper functions + + @skipCPUIfNoMkl + @skipCUDAIfRocm + @onlyOnCPUAndCUDA + @unittest.skipIf(not TEST_NUMPY, 'NumPy not found') + @dtypes(torch.float, torch.double) + def test_fftfreq_numpy(self, device, dtype): + test_args = [ + *product( + # n + range(1, 20), + # d + (None, 10.0), + ) + ] + + functions = 
['fftfreq', 'rfftfreq'] + + for fname in functions: + torch_fn = getattr(torch.fft, fname) + numpy_fn = getattr(np.fft, fname) + + for n, d in test_args: + args = (n,) if d is None else (n, d) + expected = numpy_fn(*args) + actual = torch_fn(*args, device=device, dtype=dtype) + self.assertEqual(actual, expected, exact_dtype=False) + + @skipCPUIfNoMkl + @skipCUDAIfRocm + @onlyOnCPUAndCUDA + @unittest.skipIf(not TEST_NUMPY, 'NumPy not found') + @dtypes(torch.float, torch.double, torch.complex64, torch.complex128) + def test_fftshift_numpy(self, device, dtype): + test_args = [ + # shape, dim + *product(((11,), (12,)), (None, 0, -1)), + *product(((4, 5), (6, 6)), (None, 0, (-1,))), + *product(((1, 1, 4, 6, 7, 2),), (None, (3, 4))), + ] + + functions = ['fftshift', 'ifftshift'] + + for shape, dim in test_args: + input = torch.rand(*shape, device=device, dtype=dtype) + input_np = input.cpu().numpy() + + for fname in functions: + torch_fn = getattr(torch.fft, fname) + numpy_fn = getattr(np.fft, fname) + + expected = numpy_fn(input_np, axes=dim) + actual = torch_fn(input, dim=dim) + self.assertEqual(actual, expected) + + @skipCPUIfNoMkl + @skipCUDAIfRocm + @onlyOnCPUAndCUDA + @unittest.skipIf(not TEST_NUMPY, 'NumPy not found') + @dtypes(torch.float, torch.double) + def test_fftshift_frequencies(self, device, dtype): + for n in range(10, 15): + sorted_fft_freqs = torch.arange(-(n // 2), n - (n // 2), + device=device, dtype=dtype) + x = torch.fft.fftfreq(n, d=1 / n, device=device, dtype=dtype) + + # Test fftshift sorts the fftfreq output + shifted = torch.fft.fftshift(x) + self.assertTrue(torch.allclose(shifted, shifted.sort().values)) + self.assertEqual(sorted_fft_freqs, shifted) + + # And ifftshift is the inverse + self.assertEqual(x, torch.fft.ifftshift(shifted)) + # Legacy fft tests def _test_fft_ifft_rfft_irfft(self, device, dtype): def _test_complex(sizes, signal_ndim, prepro_fn=lambda x: x): diff --git a/test/test_static_runtime.py b/test/test_static_runtime.py index 407ea03acda6..86dafa3903dd 100644 --- a/test/test_static_runtime.py +++ b/test/test_static_runtime.py @@ -106,7 +106,8 @@ def test_multihead_attention_layer(self): DROPOUT = 0.1 device = torch.device("cpu") attention = MultiHeadAttentionLayer(HID_DIM, HEADS, DROPOUT, device).to(device) - src = torch.randn(BATCH_SIZE, QUERY_LEN, HID_DIM).to(device) + with torch.no_grad(): + src = torch.randn(BATCH_SIZE, QUERY_LEN, HID_DIM).to(device) src_mask = (src > 0)[:, :, 0].unsqueeze(1).unsqueeze(2).to(device) attention.eval() @@ -129,8 +130,9 @@ def test_mlp(self): bot_l_acc = StaticRuntime(bot_l) top_l = create_mlp(ln_top, sigmoid_top) top_l_acc = StaticRuntime(top_l) - bot_inp = torch.randn(2048, 512) # torch.Size([2048, 512]) - top_inp = torch.randn(2048, 100) # torch.Size([2048, 100]) + with torch.no_grad(): + bot_inp = torch.randn(2048, 512) # torch.Size([2048, 512]) + top_inp = torch.randn(2048, 100) # torch.Size([2048, 100]) ref_bot = bot_l(bot_inp) acc_bot = bot_l_acc(bot_inp)[0] torch.testing.assert_allclose(acc_bot, ref_bot) @@ -138,8 +140,9 @@ def test_mlp(self): acc_top = top_l_acc(top_inp)[0] torch.testing.assert_allclose(acc_top, ref_top) for _ in range(5): - bot_inp = torch.randn(2048, 512) # torch.Size([2048, 512]) - top_inp = torch.randn(2048, 100) # torch.Size([2048, 100]) + with torch.no_grad(): + bot_inp = torch.randn(2048, 512) # torch.Size([2048, 512]) + top_inp = torch.randn(2048, 100) # torch.Size([2048, 100]) ref_bot = bot_l(bot_inp) acc_bot = bot_l_acc(bot_inp)[0] torch.testing.assert_allclose(acc_bot, ref_bot) @@ 
-147,13 +150,13 @@ def test_mlp(self): acc_top = top_l_acc(top_inp)[0] torch.testing.assert_allclose(acc_top, ref_top) - # def test_trivial_graph(self): - # s = torch.full((2, 2), 2) - # tg = torch.jit.script(trivial_graph) - # o_ref = tg(s, s, s) - # tg_a = StaticRuntime(tg) - # o_test = tg_a(s, s, s)[0] - # torch.testing.assert_allclose(o_ref, o_test) + def test_trivial_graph(self): + s = torch.full((2, 2), 2) + tg = torch.jit.script(trivial_graph) + o_ref = tg(s, s, s) + tg_a = StaticRuntime(tg) + o_test = tg_a(s, s, s)[0] + torch.testing.assert_allclose(o_ref, o_test) if __name__ == "__main__": diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py index ab6eae83568e..d9e0f59a5210 100644 --- a/test/test_tensor_creation_ops.py +++ b/test/test_tensor_creation_ops.py @@ -6,7 +6,7 @@ import torch from torch.testing._internal.common_utils import \ - (TestCase, run_tests, do_test_empty_full, TEST_NUMPY, suppress_warnings, + (TestCase, run_tests, do_test_empty_full, TEST_NUMPY, TEST_WITH_ROCM, suppress_warnings, torch_to_numpy_dtype_dict, slowTest) from torch.testing._internal.common_device_type import \ (instantiate_device_type_tests, deviceCountAtLeast, onlyOnCPUAndCUDA, @@ -1047,7 +1047,9 @@ def test_logspace_special_steps(self, device, dtype): self._test_logspace_base2(device, dtype, steps=steps) @dtypes(*torch.testing.get_all_dtypes(include_bool=False, include_half=False, include_complex=False)) - @dtypesIfCUDA(*torch.testing.get_all_dtypes(include_bool=False, include_half=True, include_complex=False)) + @dtypesIfCUDA(*((torch.testing.get_all_int_dtypes() + [torch.float32, torch.float16, torch.bfloat16]) + if TEST_WITH_ROCM + else torch.testing.get_all_dtypes(include_bool=False, include_half=True, include_complex=False))) def test_logspace(self, device, dtype): _from = random.random() to = _from + random.random() diff --git a/test/test_tensorexpr.py b/test/test_tensorexpr.py index 143c6dab91d2..739957569962 100644 --- a/test/test_tensorexpr.py +++ b/test/test_tensorexpr.py @@ -1046,18 +1046,18 @@ def easy(x, y): # FIXME: interp.elapsed_value() also increments due to simplifier assert llvm.elapsed_value() == 1 or interp.elapsed_value() > 1 - def test_unsqueeze(self): + def test_unsqueeze(self, N=256): def easy(x, y): a = torch.unsqueeze(x, 0) b = torch.unsqueeze(y, 0) return a + b - traced = torch.jit.trace(easy, (torch.ones(1024, 1024), torch.zeros(1024, 1024))) + traced = torch.jit.trace(easy, (torch.ones(N, N), torch.zeros(N, N))) llvm = LLVMCodeGenExecuted() interp = SimpleIREvalExecuted() - a = torch.rand(1024, 1024) + a = torch.rand(N, N) x = traced(a, a) npr = np.expand_dims(a, 0) npr = npr + npr diff --git a/test/test_torch.py b/test/test_torch.py index dd012305ad56..7e4fe38fdc97 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -41,11 +41,10 @@ from typing import Dict, List, Tuple, Union import torch.backends.quantized import torch.testing._internal.data -from torch.testing._internal.common_cuda import tf32_on_and_off, tf32_is_not_fp32, \ +from torch.testing._internal.common_cuda import tf32_on_and_off, tf32_is_not_fp32, with_tf32_off, \ _get_torch_cuda_version, TEST_MAGMA - # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for # sharding on sandcastle. 
This line silences flake warnings load_tests = load_tests @@ -59,7 +58,7 @@ SIZE = 100 -AMPERE_OR_ROCM = TEST_WITH_ROCM or tf32_is_not_fp32() +AMPERE_OR_ROCM = TEST_WITH_ROCM or tf32_is_not_fp32() # Wrap base test class into a class to hide it from testing # See https://stackoverflow.com/a/25695512 @@ -230,10 +229,6 @@ def test_linear_algebra_scalar_raises(self) -> None: s = torch.tensor(7) self.assertRaises(RuntimeError, lambda: torch.mv(m, s)) self.assertRaises(RuntimeError, lambda: torch.addmv(v, m, s)) - self.assertRaises(RuntimeError, lambda: torch.ger(v, s)) - self.assertRaises(RuntimeError, lambda: torch.ger(s, v)) - self.assertRaises(RuntimeError, lambda: torch.addr(m, v, s)) - self.assertRaises(RuntimeError, lambda: torch.addr(m, s, v)) @unittest.skipIf(not TEST_SCIPY, "Scipy not found") def test_mvlgamma(self): @@ -659,6 +654,13 @@ def test_copy_transpose(self): self.assertEqual(y[:, 0], range(100)) self.assertEqual(y[:, 40], range(4000, 4100)) + # Validates regression reported in https://github.com/pytorch/pytorch/issues/45269 + x = torch.arange(100 * 100).reshape(100, 100).to(dtype=torch.cfloat).t() + y = torch.empty(100, 100, dtype=torch.cfloat) + y.copy_(x) + self.assertEqual(y[:, 0], range(100)) + self.assertEqual(y[:, 40], range(4000, 4100)) + def test_device(self): cpu = torch.device('cpu') self.assertEqual('cpu', str(cpu)) @@ -4698,6 +4700,22 @@ def add_neg_dim_tests(): class TestTorchDeviceType(TestCase): exact_dtype = True + @onlyCPU + def test_set_deterministic_beta_warning(self, device): + det = torch.is_deterministic() + try: + # Ensures setting to false does not throw a warning + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + torch.set_deterministic(False) + self.assertEqual(len(w), 0) + + # Setting set_deterministic(True) throws a warning once per process + with self.maybeWarnsRegex(UserWarning, "torch.set_deterministic is in beta"): + torch.set_deterministic(True) + finally: + torch.set_deterministic(det) + # Tests that trying to add, inplace, a CUDA tensor to a CPU tensor # throws the correct error message @onlyCUDA @@ -4789,6 +4807,10 @@ def test_abs_angle_complex_to_float(self, device, dtype): if fn_name == 'abs': torch_inplace_method = getattr(torch.Tensor, fn_name + "_") np_fn(a, out=a) + if dtype.is_complex: + with self.assertRaisesRegex(RuntimeError, "In-place abs is not supported for complex tensors."): + torch_inplace_method(t) + return torch_inplace_method(t) self.assertEqual(torch.from_numpy(a), t.cpu()) @@ -6306,60 +6328,38 @@ def test_heaviside_complex(self, device, dtypes): def test_logical_not(self, device, dtype): data = [10, 1, 0.3, 0, -0.3, -1, -10] a = torch.tensor(data, dtype=dtype, device=device) - - # do this before constructing the numpy array because np can't construct - # bfloat16 tensors. Can we define our own dtype in NumPy so testing would be easier? 
- if dtype == torch.bfloat16 or dtype.is_complex: - self.assertRaises(RuntimeError, lambda: a.logical_not()) - self.assertRaises(RuntimeError, lambda: a.logical_not_()) - raise unittest.SkipTest('logical_not not supported on {}'.format(dtype)) - - a_np = np.array(data, dtype=torch_to_numpy_dtype_dict[dtype]) - self.assertEqual(np.logical_not(a_np), torch.logical_not(a).to('cpu')) - self.assertEqual(np.logical_not(a_np, out=a_np), a.logical_not_().to('cpu')) + if dtype == torch.bfloat16: # numpy doesn't support these dtypes + result = [False, False, False, True, False, False, False] + self.assertEqual(torch.logical_not(a), torch.tensor(result, dtype=torch.bool, device=device)) + else: + a_np = np.array(data, dtype=torch_to_numpy_dtype_dict[dtype]) + self.assertEqual(np.logical_not(a_np), torch.logical_not(a).to('cpu')) + self.assertEqual(np.logical_not(a_np, out=a_np), a.logical_not_().to('cpu')) @unittest.skipIf(not TEST_NUMPY, 'Numpy not found') - @dtypes(*list(product(torch.testing.get_all_dtypes(), - torch.testing.get_all_dtypes()))) + @dtypes(*product(torch.testing.get_all_dtypes(), + torch.testing.get_all_dtypes())) def test_logical_not_out(self, device, dtypes): dtype = dtypes[0] out_dtype = dtypes[1] data = [10, 1, 0.3, 0, -0.3, -1, -10] a = torch.tensor(data, dtype=dtype, device=device) - out = torch.empty(a.shape, dtype=out_dtype, device=device) - - if (dtype == torch.bfloat16 or dtype.is_complex or - out_dtype == torch.bfloat16 or out_dtype.is_complex): - self.assertRaises(RuntimeError, lambda: torch.logical_not(a, out=out)) - raise unittest.SkipTest('logical_not not supported on {}'.format(out_dtype)) - - out_np = np.empty(a.shape, dtype=torch_to_numpy_dtype_dict[out_dtype]) - - self.assertEqual(a, a.cpu().numpy()) - torch.logical_not(a, out=out) - np.logical_not(a.cpu().numpy(), out=out_np) - self.assertEqual(out_np, out.to('cpu')) + out = torch.empty_like(a, dtype=out_dtype, device=device) + if torch.bfloat16 in dtypes: # numpy doesn't support these dtypes + result = [not i for i in a] + self.assertEqual(torch.logical_not(a, out=out), torch.tensor(result, dtype=out_dtype, device=device)) + else: + out_np = np.empty(a.shape, dtype=torch_to_numpy_dtype_dict[out_dtype]) + self.assertEqual(a, a.cpu().numpy()) + torch.logical_not(a, out=out) + np.logical_not(a.cpu().numpy(), out=out_np) + self.assertEqual(out_np, out.to('cpu')) def _test_logical(self, device, dtypes, op, a_, b_, expected_res_): expected_res = torch.tensor(expected_res_, dtype=dtypes[0], device=device) a = torch.tensor(a_, dtype=dtypes[0], device=device) b = torch.tensor(b_, dtype=dtypes[1], device=device) - # Skip bfloat16 on CUDA. Remove this after bfloat16 is supported on CUDA. - # After type promotion of bfloat16 is supported, some bfloat16 logical operation will go through on - # CUDA as long as the two tensors are promoted to a supported type. - # TODO: Remove this once logical operators are improved to take care of bfloat16. - if self.device_type == 'cuda' and torch.bfloat16 in dtypes: - if torch.promote_types(dtypes[0], dtypes[1]) == torch.bfloat16: - with self.assertRaises(RuntimeError): - getattr(a, op)(b) - return - - if dtypes[0].is_complex or dtypes[1].is_complex: - with self.assertRaises(RuntimeError): - getattr(a, op)(b) - return - # new tensor self.assertEqual(expected_res.bool(), getattr(a, op)(b)) # out @@ -6374,18 +6374,6 @@ def _test_logical(self, device, dtypes, op, a_, b_, expected_res_): getattr(a, op + '_')(b) return - # TODO: remove when logical ops support bfloat16 on CUDA. 
- if self.device_type == 'cuda' and dtypes[0] == torch.bfloat16: - with self.assertRaises(RuntimeError): - getattr(a, op + '_')(b) - return - - # TODO: remove when complex ops are supported - if dtypes[0].is_complex: - with self.assertRaises(RuntimeError): - getattr(a, op + '_')(b) - return - getattr(a, op + '_')(b) self.assertEqual(expected_res, a) @@ -7033,6 +7021,9 @@ def test_matrix_exp_boundary_cases(self, device, dtype): @skipCUDAIfNoMagma @skipCPUIfNoLapack @dtypes(torch.float, torch.double) + # Although tf32 is always disabled on matrix_exp, this test uses matmul, + # which has tf32 on by default + @with_tf32_off def test_matrix_exp_analytic(self, device, dtype): # check zero matrix x = torch.zeros(20, 20, dtype=dtype, device=device) @@ -7174,6 +7165,9 @@ def run_test(*n): @skipCUDAIfNoMagma @skipCPUIfNoLapack @dtypes(torch.float, torch.double) + # Although tf32 is always disabled on matrix_exp, this test uses matmul, + # which has tf32 on by default + @with_tf32_off def test_matrix_exp_compare_with_taylor(self, device, dtype): def normalize_to_1_operator_norm(sample, desired_norm): @@ -7758,14 +7752,29 @@ def cholesky_test_helper(n, batchsize, device, upper): for upper, batchsize in product([True, False], [262144, 524288]): cholesky_test_helper(2, batchsize, device, upper) + @precisionOverride({torch.float32: 1e-4, torch.complex64: 1e-4}) @skipCUDAIfNoMagma @skipCPUIfNoLapack - @dtypes(torch.double) + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) def test_cholesky_batched(self, device, dtype): - from torch.testing._internal.common_utils import random_symmetric_pd_matrix + from torch.testing._internal.common_utils import \ + (random_symmetric_pd_matrix, + random_fullrank_matrix_distinct_singular_value) def cholesky_test_helper(n, batch_dims, upper): - A = random_symmetric_pd_matrix(n, *batch_dims, dtype=dtype, device=device) + # This is a workaround while there is no support for complex random_symmetric_pd_matrix + if dtype.is_complex: + real_dtype = torch.float32 if dtype is torch.complex64 else torch.float64 + A_real = random_fullrank_matrix_distinct_singular_value(n, *batch_dims, dtype=real_dtype, device=device) + A_imag = random_fullrank_matrix_distinct_singular_value(n, *batch_dims, dtype=real_dtype, device=device) + A = A_real + 1j * A_imag + # There is no support for complex batched matmul yet + matmul_list = [] + for mat in A.contiguous().view(-1, n, n): + matmul_list.append(mat @ mat.t().conj()) + A = torch.stack(matmul_list).view(*batch_dims, n, n) + else: + A = random_symmetric_pd_matrix(n, *batch_dims, dtype=dtype, device=device) cholesky_exp = torch.stack([m.cholesky(upper=upper) for m in A.reshape(-1, n, n)]) cholesky_exp = cholesky_exp.reshape_as(A) self.assertEqual(cholesky_exp, torch.cholesky(A, upper=upper)) @@ -7773,26 +7782,38 @@ def cholesky_test_helper(n, batch_dims, upper): for upper, batchsize in product([True, False], [(3,), (3, 4), (2, 3, 4)]): cholesky_test_helper(3, batchsize, upper) + @precisionOverride({torch.float32: 1e-4, torch.complex64: 1e-4}) @skipCUDAIfNoMagma @skipCPUIfNoLapack - @dtypes(torch.double) + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) def test_cholesky(self, device, dtype): - x = torch.rand(10, 10, dtype=dtype, device=device) + 1e-1 - A = torch.mm(x, x.t()) + from torch.testing._internal.common_utils import \ + (random_symmetric_pd_matrix, + random_fullrank_matrix_distinct_singular_value) + + # This is a workaround while there is no support for complex random_symmetric_pd_matrix 
+ if dtype.is_complex: + real_dtype = torch.float32 if dtype is torch.complex64 else torch.float64 + A_real = random_fullrank_matrix_distinct_singular_value(10, dtype=real_dtype, device=device) + A_imag = random_fullrank_matrix_distinct_singular_value(10, dtype=real_dtype, device=device) + A = A_real + 1j * A_imag + A = A @ A.t().conj() + else: + A = random_symmetric_pd_matrix(10, dtype=dtype, device=device) # default Case C = torch.cholesky(A) - B = torch.mm(C, C.t()) + B = torch.mm(C, C.t().conj()) self.assertEqual(A, B, atol=1e-14, rtol=0) # test Upper Triangular U = torch.cholesky(A, True) - B = torch.mm(U.t(), U) + B = torch.mm(U.t().conj(), U) self.assertEqual(A, B, atol=1e-14, rtol=0, msg='cholesky (upper) did not allow rebuilding the original matrix') # test Lower Triangular L = torch.cholesky(A, False) - B = torch.mm(L, L.t()) + B = torch.mm(L, L.t().conj()) self.assertEqual(A, B, atol=1e-14, rtol=0, msg='cholesky (lower) did not allow rebuilding the original matrix') def test_view(self, device): @@ -9536,20 +9557,26 @@ def test_rpow(self, device): assert m.dim() == 0, "m is intentionally a scalar" self.assertEqual(torch.pow(2, m), 2**m) + @precisionOverride({torch.float32: 1e-5, torch.complex64: 1e-5}) @skipCUDAIfNoMagma @skipCPUIfNoLapack - @dtypes(torch.double) + @dtypesIfCPU(torch.float32, torch.float64, torch.complex64, torch.complex128) + @dtypesIfCUDA(torch.float32, torch.float64) def test_symeig(self, device, dtype): - from torch.testing._internal.common_utils import random_symmetric_matrix + from torch.testing._internal.common_utils import random_hermitian_matrix def run_test(dims, eigenvectors, upper): - x = random_symmetric_matrix(*dims, dtype=dtype, device=device) - oute = torch.empty(dims[1:] + dims[:1], dtype=dtype, device=device) + x = random_hermitian_matrix(*dims, dtype=dtype, device=device) + if dtype.is_complex: + real_dtype = torch.float32 if dtype is torch.complex64 else torch.float64 + else: + real_dtype = dtype + oute = torch.empty(dims[1:] + dims[:1], dtype=real_dtype, device=device) outv = torch.empty(dims[1:] + dims[:1] * 2, dtype=dtype, device=device) torch.symeig(x, eigenvectors=eigenvectors, upper=upper, out=(oute, outv)) if eigenvectors: - x_recon = torch.matmul(torch.matmul(outv, torch.diag_embed(oute)), outv.transpose(-2, -1)) + x_recon = torch.matmul(torch.matmul(outv, torch.diag_embed(oute.to(dtype))), outv.transpose(-2, -1).conj()) self.assertEqual(x, x_recon, atol=1e-8, rtol=0, msg='Incorrect reconstruction using V @ diag(e) @ V.T') else: eigvals, _ = torch.symeig(x, eigenvectors=True, upper=upper) @@ -9561,14 +9588,14 @@ def run_test(dims, eigenvectors, upper): self.assertEqual(resv, outv, msg="outputs of symeig and symeig with out don't match") # test non-contiguous - x = random_symmetric_matrix(*dims, dtype=dtype, device=device) + x = random_hermitian_matrix(*dims, dtype=dtype, device=device) n_dim = len(dims) + 1 # Reverse the batch dimensions and the matrix dimensions and then concat them x = x.permute(tuple(range(n_dim - 3, -1, -1)) + (n_dim - 1, n_dim - 2)) assert not x.is_contiguous(), "x is intentionally non-contiguous" rese, resv = torch.symeig(x, eigenvectors=eigenvectors, upper=upper) if eigenvectors: - x_recon = torch.matmul(torch.matmul(resv, torch.diag_embed(rese)), resv.transpose(-2, -1)) + x_recon = torch.matmul(torch.matmul(resv, torch.diag_embed(rese.to(dtype))), resv.transpose(-2, -1).conj()) self.assertEqual(x, x_recon, atol=1e-8, rtol=0, msg='Incorrect reconstruction using V @ diag(e) @ V.T') else: eigvals, _ = 
torch.symeig(x, eigenvectors=True, upper=upper) @@ -9579,6 +9606,25 @@ def run_test(dims, eigenvectors, upper): for batch_dims, eigenvectors, upper in product(batch_dims_set, (True, False), (True, False)): run_test((5,) + batch_dims, eigenvectors, upper) + # TODO: once there is more support for complex dtypes on GPU, they shall be added to above test + # particularly when RuntimeError: _th_bmm_out not supported on CUDAType for ComplexFloat is fixed + @unittest.expectedFailure + @onlyCUDA + @skipCUDAIfNoMagma + @dtypes(torch.complex64, torch.complex128) + def test_symeig_complex_xfailed(self, device, dtype): + from torch.testing._internal.common_utils import random_hermitian_matrix + + dims = (5, 3) + x = random_hermitian_matrix(*dims, dtype=dtype, device=device) + real_dtype = torch.float32 if dtype is torch.complex64 else torch.float64 + oute = torch.empty(dims[1:] + dims[:1], dtype=real_dtype, device=device) + outv = torch.empty(dims[1:] + dims[:1] * 2, dtype=dtype, device=device) + torch.symeig(x, eigenvectors=eigenvectors, upper=upper, out=(oute, outv)) + + x_recon = torch.matmul(torch.matmul(outv, torch.diag_embed(oute.to(dtype))), outv.transpose(-2, -1).conj()) + self.assertEqual(x, x_recon, atol=1e-8, rtol=0) + @skipCUDAIfNoMagma @skipCPUIfNoLapack @dtypes(torch.double) @@ -10784,15 +10830,6 @@ def assert_tuple_empty(tup, dim): self.assertEqual(1, len(z)) self.assertEqual(torch.empty(0, dtype=torch.long), z[0]) - @onlyOnCPUAndCUDA - def test_nonzero_deprecated(self, device): - x = torch.randn((2, 3), device=device) - with self.maybeWarnsRegex(UserWarning, "This overload of nonzero is deprecated"): - x.nonzero() - - with self.maybeWarnsRegex(UserWarning, "This overload of nonzero is deprecated"): - torch.nonzero(x) - # TODO: add torch.complex64, torch.complex128 @dtypes(torch.float, torch.double) def test_normal(self, device, dtype): @@ -11291,6 +11328,19 @@ def test_signbit_complex(self, device, dtype): with self.assertRaisesRegex(RuntimeError, 'signbit is not implemented for complex tensors.'): torch.signbit(t, out=out) + @dtypes(torch.cfloat, torch.cdouble) + def test_sgn(self, device, dtype): + x = torch.randn(100, dtype=dtype) + angle = x.angle() + out = x.sgn() + self.assertEqual(out.angle(), angle) + self.assertEqual(out.abs(), torch.ones_like(x).real) + + x_out = torch.empty_like(x) + torch.sgn(x, out=x_out) + self.assertEqual(x_out.angle(), angle) + self.assertEqual(x_out.abs(), torch.ones_like(x).real) + @dtypes(*(torch.testing.get_all_dtypes(include_bool=False))) def test_signbit_non_boolean_output(self, device, dtype): # test non-boolean tensors as the `out=` parameters @@ -11648,6 +11698,24 @@ def test_add(self, device): m2 = torch.tensor([3., 4.], dtype=torch.bfloat16) self.assertEqual(m1 + m2, torch.tensor([4., 6.], dtype=torch.bfloat16)) + # different alpha types + m1 = torch.tensor([2 + 3j, 4 + 5j], dtype=torch.complex64, device=device) + m2 = torch.tensor([4 + 5j, 2 + 3j], dtype=torch.complex64, device=device) + # add complex numbers with float alpha + res = torch.add(m1, m2, alpha=0.1) + expected = torch.tensor([2.4000 + 3.5000j, 4.2000 + 5.3000j], dtype=torch.complex64, device=device) + self.assertEqual(res, expected) + + # add complex numbers with complex alpha + res = torch.add(m1, m2, alpha=complex(0.1, 0.2)) + expected = torch.tensor([1.4000 + 4.3000j, 3.6000 + 5.7000j], dtype=torch.complex64, device=device) + self.assertEqual(res, expected) + + # add complex numbers with integer alpha + res = torch.add(m1, m2, alpha=2) + expected = torch.tensor([10. 
+ 13.j, 8. + 11.j], dtype=torch.complex64, device=device) + self.assertEqual(res, expected) + # mismatched alpha m1 = torch.tensor([1], dtype=torch.int8, device=device) m2 = torch.tensor([2], dtype=torch.int8, device=device) @@ -11658,6 +11726,15 @@ def test_add(self, device): r"For integral input tensors, argument alpha must not be a floating point number\.", lambda: torch.add(m1, m2, alpha=1.0)) + # mismatched alpha, float / double tensor and complex alpha + m1 = torch.tensor([3., 4.], device=device) + m2 = torch.tensor([4., 3.], device=device) + self.assertRaises(RuntimeError, lambda: torch.add(m1, m2, alpha=complex(0.1, 0.2))) + + m1 = torch.tensor([3., 4.], dtype=torch.double, device=device) + m2 = torch.tensor([4., 3.], dtype=torch.double, device=device) + self.assertRaises(RuntimeError, lambda: torch.add(m1, m2, alpha=complex(0.1, 0.2))) + # complex m1 = torch.tensor((4.0000 + 4.0000j), dtype=torch.complex64) m2 = torch.tensor(4., dtype=torch.float64) @@ -12660,7 +12737,7 @@ def test_scatter_reduce_non_unique_index(self, device, dtype): input.scatter_(0, index, src, reduce=operation) self.assertEqual(input, result, msg=f"result: {result} input: {input} method: {str(operation)}") - @skipCUDAIfRocm + @skipCUDAIfRocm @onlyOnCPUAndCUDA @dtypesIfCUDA(*(torch.testing.get_all_complex_dtypes() + torch.testing.get_all_int_dtypes())) @@ -13019,10 +13096,6 @@ def gen_nontrivial_input(shape, dtype, device): dst2 = tensor.nonzero(as_tuple=False) dst3 = torch.empty([], dtype=torch.long, device=device) torch.nonzero(tensor, out=dst3) - self.assertRaisesRegex( - TypeError, - "received an invalid combination of arguments", - lambda: torch.nonzero(tensor, as_tuple=True, out=dst3)) if self.device_type != 'xla': # xla does not raise runtime error self.assertRaisesRegex( @@ -13048,6 +13121,37 @@ def gen_nontrivial_input(shape, dtype, device): self.assertEqual(tup1, np_result, atol=0, rtol=0) self.assertEqual(tup2, np_result, atol=0, rtol=0) + def test_nonzero_astuple_out(self, device): + t = torch.randn((3, 3, 3), device=device) + out = torch.empty_like(t, dtype=torch.long) + + with self.assertRaises(RuntimeError): + torch.nonzero(t, as_tuple=True, out=out) + + self.assertEqual(torch.nonzero(t, as_tuple=False, out=out), torch.nonzero(t, out=out)) + + # Verifies that JIT script cannot handle the as_tuple kwarg + # See Issue https://github.com/pytorch/pytorch/issues/45499. 
+ def _foo(t): + tuple_result = torch.nonzero(t, as_tuple=True) + nontuple_result = torch.nonzero(t, as_tuple=False) + out = torch.empty_like(nontuple_result) + torch.nonzero(t, as_tuple=False, out=out) + return tuple_result, nontuple_result, out + + with self.assertRaises(RuntimeError): + scripted_foo = torch.jit.script(_foo) + + # Verifies that JIT tracing works fine + traced_foo = torch.jit.trace(_foo, t) + traced_tuple, traced_nontuple, traced_out = traced_foo(t) + expected_tuple = torch.nonzero(t, as_tuple=True) + expected_nontuple = torch.nonzero(t) + + self.assertEqual(traced_tuple, expected_tuple) + self.assertEqual(traced_nontuple, expected_nontuple) + self.assertEqual(traced_out, expected_nontuple) + @onlyOnCPUAndCUDA def test_nonzero_discontiguous(self, device): shape = (4, 4) @@ -13705,6 +13809,15 @@ def test_float_scalar_pow_float_tensor(self, device): for base in floats: self._test_pow(base, tensor) + @onlyOnCPUAndCUDA + @unittest.skipIf(not TEST_NUMPY, 'Numpy not found') + @dtypes(*(torch.testing.get_all_dtypes(include_bool=False, include_bfloat16=False))) + def test_complex_scalar_pow_tensor(self, device, dtype): + complexes = [0.5j, 1. + 1.j, -1.5j, 2.2 - 1.6j] + tensor = torch.rand(100).to(dtype=dtype, device=device) + for base in complexes: + self._test_pow(base, tensor) + @unittest.skipIf(not TEST_NUMPY, 'Numpy not found') def test_tensor_pow_tensor(self, dev): def rotate(l, n): @@ -13812,15 +13925,6 @@ def call_torch_fn(*args, **kwargs): self.assertEqual(t, fn(torch.addmv, t, (3, 0), (0,))) self.assertEqual(t, fn(torch.addmv, t, (3, 0), (0,), test_out=True)) - # ger, addr - self.assertEqual((0, 0), fn(torch.ger, (0,), (0,)).shape) - self.assertEqual((5, 0), fn(torch.ger, (5,), (0,)).shape) - self.assertEqual((0, 4), fn(torch.ger, (0,), (4,)).shape) - - self.assertEqual((0, 0), fn(torch.addr, (0, 0), (0,), (0,)).shape) - self.assertEqual((5, 0), fn(torch.addr, (5, 0), (5,), (0,)).shape) - self.assertEqual((0, 4), fn(torch.addr, (0, 4), (0,), (4,)).shape) - # bmm, baddbmm self.assertEqual((0, 0, 0), fn(torch.bmm, (0, 0, 0), (0, 0, 0)).shape) self.assertEqual((3, 0, 5), fn(torch.bmm, (3, 0, 0), (3, 0, 5)).shape) @@ -14222,28 +14326,268 @@ def test_binary_op_scalar_device_unspecified(self, devices): self.assertEqual(y1.device, device_obj) self.assertEqual(y0, y1) - # Tests that CPU scalars (including zero dim tensors) can be used in - # binary operations with CUDA tensors. - @onlyCUDA - def test_cuda_cpu_scalar_binary_ops(self, device): - val_scalar = math.pi - val_tensor = torch.tensor(val_scalar) - for op in (operator.add, torch.add, - operator.sub, torch.sub, - operator.mul, torch.mul, - operator.truediv, torch.true_divide, - operator.floordiv, torch.floor_divide): - for tensor_val in (1, (1,)): - t_cuda = torch.tensor(tensor_val, device=device) - t_cpu = t_cuda.cpu() - for val in (val_scalar, val_tensor): - cpu_result = op(t_cpu, val) - cuda_result = op(t_cuda, val) - self.assertEqual(cpu_result, cuda_result) - - reverse_cpu_result = op(val, t_cpu) - reverse_cuda_result = op(val, t_cuda) - self.assertEqual(reverse_cpu_result, reverse_cuda_result) + def test_div_and_floordiv_vs_python(self, device): + # Tests torch division ops which can handle both arguments being + # scalars. + # NOTE: torch.floor_divide currently truncates instead of flooring. + # the quotient. See https://github.com/pytorch/pytorch/issues/43874. 
+        def _scalar_helper(python_op, torch_op):
+            for a, b in product(range(-10, 10), range(-10, 10)):
+                for op in (lambda x: x * .5, lambda x: math.floor(x)):
+                    a = op(a)
+                    b = op(b)
+
+                    # Skips zero divisors
+                    if b == 0:
+                        continue
+
+                    expected = python_op(a, b)
+
+                    for op in (operator.truediv, torch.true_divide):
+                        actual_scalar = torch_op(a, b)
+
+                        a_t = torch.tensor(a, device=device)
+                        b_t = torch.tensor(b, device=device)
+
+                        actual_tensor = torch_op(a_t, b_t)
+                        actual_first_tensor = torch_op(a_t, b)
+                        actual_second_tensor = torch_op(a, b_t)
+
+                        self.assertEqual(actual_scalar, expected)
+                        self.assertEqual(actual_tensor.item(), expected)
+                        self.assertEqual(actual_first_tensor, actual_tensor)
+                        self.assertEqual(actual_second_tensor, actual_tensor)
+
+        _scalar_helper(operator.truediv, operator.truediv)
+        _scalar_helper(operator.truediv, torch.true_divide)
+        _scalar_helper(lambda a, b: math.trunc(a / b), operator.floordiv)
+        _scalar_helper(lambda a, b: math.trunc(a / b), torch.floor_divide)
+
+    # NOTE: torch.floor_divide currently truncates instead of flooring.
+    # See https://github.com/pytorch/pytorch/issues/43874.
+    @onlyOnCPUAndCUDA
+    def test_div_and_floordiv_script_vs_python(self, device):
+        # Creates jitted functions of two tensors
+        def _wrapped_div(a, b):
+            return a / b
+
+        def _wrapped_floordiv(a, b):
+            return a // b
+
+        scripted_div = torch.jit.script(_wrapped_div)
+        scripted_floordiv = torch.jit.script(_wrapped_floordiv)
+        for a, b in product(range(-10, 10), range(-10, 10)):
+            for op in (lambda x: x * .5, lambda x: math.floor(x)):
+                a = op(a)
+                b = op(b)
+
+                # Skips zero divisors
+                if b == 0:
+                    continue
+
+                expected_div = a / b
+                expected_truncdiv = math.trunc(a / b)
+                a_t = torch.tensor(a, device=device)
+                b_t = torch.tensor(b, device=device)
+
+                self.assertEqual(scripted_div(a_t, b_t), expected_div)
+                self.assertEqual(scripted_floordiv(a_t, b_t), expected_truncdiv)
+
+        # Creates jitted functions of one tensor
+        def _wrapped_div_scalar(a):
+            return a / 5
+
+        # NOTE: this will fail when given an integer input, since
+        # the JIT implements division as
+        # torch.reciprocal(a) * 5, and reciprocal is only
+        # implemented for float types.
+        def _wrapped_rdiv_scalar(a):
+            return 5 / a
+
+        def _wrapped_floordiv_scalar(a):
+            return a // 5
+
+        # NOTE: this fails if the input is not an integer tensor
+        # See https://github.com/pytorch/pytorch/issues/45199
+        def _wrapped_rfloordiv_scalar(a):
+            return 5 // a
+
+        scripted_div_scalar = torch.jit.script(_wrapped_div_scalar)
+        scripted_rdiv_scalar = torch.jit.script(_wrapped_rdiv_scalar)
+        scripted_floordiv_scalar = torch.jit.script(_wrapped_floordiv_scalar)
+        scripted_rfloordiv_scalar = torch.jit.script(_wrapped_rfloordiv_scalar)
+
+        for a in range(-10, 10):
+            for op in (lambda x: x * .5, lambda x: math.floor(x)):
+                a = op(a)
+
+                a_t = torch.tensor(a, device=device)
+
+                self.assertEqual(a / 5, scripted_div_scalar(a_t))
+                self.assertEqual(math.trunc(a / 5), scripted_floordiv_scalar(a_t))
+
+                # Skips zero divisors
+                if a == 0:
+                    continue
+
+                if a_t.is_floating_point():
+                    self.assertEqual(5 / a, scripted_rdiv_scalar(a_t))
+                else:
+                    with self.assertRaises(RuntimeError):
+                        scripted_rdiv_scalar(a_t)
+
+
+                # Handles Issue 45199 (see comment above)
+                if a_t.is_floating_point():
+                    with self.assertRaises(RuntimeError):
+                        scripted_rfloordiv_scalar(a_t)
+                else:
+                    self.assertEqual(5 // a, scripted_rfloordiv_scalar(a_t))
+
+    # NOTE: torch.floor_divide currently truncates instead of flooring
+    # the quotient. See https://github.com/pytorch/pytorch/issues/43874.
+    @onlyOnCPUAndCUDA
+    def test_idiv_and_ifloordiv_vs_python(self, device):
+        def _wrapped_idiv_tensor(a, b):
+            a /= b
+            return a
+
+        def _wrapped_idiv_scalar(a):
+            a /= 5
+            return a
+
+        def _wrapped_true_divide__tensor(a, b):
+            a.true_divide_(b)
+            return a
+
+        def _wrapped_true_divide__scalar(a):
+            a.true_divide_(5)
+            return a
+
+        def _wrapped_floor_divide__tensor(a, b):
+            a.floor_divide_(b)
+            return a
+
+        def _wrapped_floor_divide__scalar(a):
+            a.floor_divide_(5)
+            return a
+
+        # The following functions are unsupported by the JIT
+        def _wrapped_ifloordiv_tensor(a, b):
+            a //= b
+            return a
+
+        def _wrapped_ifloordiv_scalar(a):
+            a //= 5
+            return a
+
+        with self.assertRaises(torch.jit.frontend.NotSupportedError):
+            scripted_ifloordiv_tensor = torch.jit.script(_wrapped_ifloordiv_tensor)
+
+        with self.assertRaises(torch.jit.frontend.NotSupportedError):
+            scripted_ifloordiv_scalar = torch.jit.script(_wrapped_ifloordiv_scalar)
+
+        scripted_idiv_tensor = torch.jit.script(_wrapped_idiv_tensor)
+        scripted_idiv_scalar = torch.jit.script(_wrapped_idiv_scalar)
+        scripted_true_divide__tensor = torch.jit.script(_wrapped_true_divide__tensor)
+        scripted_true_divide__scalar = torch.jit.script(_wrapped_true_divide__scalar)
+        scripted_floor_divide__tensor = torch.jit.script(_wrapped_floor_divide__tensor)
+        scripted_floor_divide__scalar = torch.jit.script(_wrapped_floor_divide__scalar)
+
+        for a, b in product(range(-10, 10), range(-10, 10)):
+            for op in (lambda x: x * .5, lambda x: math.floor(x)):
+                a = op(a)
+                b = op(b)
+
+                # Skips zero divisors
+                if b == 0:
+                    continue
+
+                expected_idiv = a / b
+                expected_ifloordiv = a // b
+                expected_itruncdiv = math.trunc(a / b)
+
+                a_t = torch.tensor(a, device=device)
+                b_t = torch.tensor(b, device=device)
+
+                if a_t.is_floating_point():
+                    tmp0 = a_t.clone()
+                    tmp0 /= b
+
+                    tmp1 = a_t.clone()
+                    tmp1 /= b_t
+
+                    self.assertEqual(tmp0.item(), expected_idiv)
+                    self.assertEqual(tmp1.item(), expected_idiv)
+                    self.assertEqual(scripted_true_divide__tensor(a_t.clone(), b_t).item(), expected_idiv)
+                    self.assertEqual(scripted_true_divide__scalar(a_t.clone()).item(), a / 5)
+                else:
+                    tmp = a_t.clone()
+                    with self.assertRaises(RuntimeError):
+                        tmp /= b
+                    with self.assertRaises(RuntimeError):
+                        tmp /= b_t
+                    with self.assertRaises(RuntimeError):
+                        scripted_true_divide__tensor(tmp, b_t)
+                    with self.assertRaises(RuntimeError):
+                        scripted_true_divide__scalar(tmp)
+
+
+                if not a_t.is_floating_point() and b_t.is_floating_point():
+                    # Inplace modification fails because a float tensor is required
+                    # if the divisor is a float tensor
+                    with self.assertRaises(RuntimeError):
+                        a_t.clone().floor_divide_(b_t)
+                    with self.assertRaises(RuntimeError):
+                        scripted_floor_divide__tensor(a_t.clone(), b_t)
+                    tmp = a_t.clone()
+                    with self.assertRaises(RuntimeError):
+                        tmp //= b_t
+                else:
+                    # Inplace modification is OK when both or neither tensor is
+                    # a float tensor
+                    self.assertEqual(a_t.clone().floor_divide_(b_t).item(), expected_itruncdiv)
+                    self.assertEqual(scripted_floor_divide__tensor(a_t.clone(), b_t).item(), expected_itruncdiv)
+                    tmp = a_t.clone()
+                    tmp //= b_t
+                    self.assertEqual(tmp.item(), expected_itruncdiv)
+
+                self.assertEqual(scripted_floor_divide__scalar(a_t), math.trunc(a / 5))
+
+    # Tests binary op equivalence with Python builtin ops
+    # Also tests that reverse operations are equivalent to forward ops
+    # NOTE: division ops are tested separately above
+    def test_binary_ops_with_scalars(self, device):
+        for ops in ((operator.add,
torch.add), + (operator.sub, torch.sub), + (operator.mul, torch.mul), + (operator.truediv, torch.div)): + python_op, torch_op = ops + + for a, b in product(range(-10, 10), range(-10, 10)): + for op in (lambda x: x * .5, lambda x: math.floor(x)): + a = op(a) + b = op(b) + + # Skips zero divisors + if b == 0 or a == 0: + continue + + a_tensor = torch.tensor(a, device=device) + b_tensor = torch.tensor(b, device=device) + a_tensor_cpu = a_tensor.cpu() + b_tensor_cpu = b_tensor.cpu() + vals = (a, b, a_tensor, b_tensor, a_tensor_cpu, b_tensor_cpu) + + for args in product(vals, vals): + first, second = args + + first_scalar = first if not isinstance(first, torch.Tensor) else first.item() + second_scalar = second if not isinstance(second, torch.Tensor) else second.item() + expected = python_op(first_scalar, second_scalar) + + self.assertEqual(expected, python_op(first, second)) + self.assertEqual(expected, torch_op(first, second)) @onlyCUDA def test_ceil_out_mismatch(self, device): @@ -14404,8 +14748,7 @@ def test_topk_integral(self, device, dtype): self.assertEqual(sort_topk, topk[0]) # check values self.assertEqual(sort_topk, a[topk[1]]) # check indices - @dtypesIfCUDA(*([torch.half, torch.float, torch.double] - + ([torch.bfloat16] if TEST_WITH_ROCM else []))) + @dtypesIfCUDA(*torch.testing.get_all_fp_dtypes()) @dtypes(torch.float, torch.double) def test_topk_nonfinite(self, device, dtype): x = torch.tensor([float('nan'), float('inf'), 1e4, 0, -1e4, -float('inf')], device=device, dtype=dtype) @@ -14712,6 +15055,8 @@ def _test_helper(x, y, bias, memory_format): lambda x, y: x.logit_(1e-6), lambda x, y: x.sign(), lambda x, y: x.sign_(), + lambda x, y: x.sgn(), + lambda x, y: x.sgn_(), lambda x, y: x.sin(), lambda x, y: x.sin_(), lambda x, y: x.sinh(), @@ -16317,52 +16662,6 @@ def tracker(worker): ---(input size: {:4}, eigenpairs:{:2}, units: relative error, maxiter={:4})--- '''.format(tol, eq_err, eq_err_general, iters1, eq_err_scipy, eq_err_general_scipy, iters2, m, k, niter)) - @slowTest - @onlyCPU - @dtypes(torch.bfloat16, torch.float, torch.double) - def test_ger(self, device, dtype): - def run_test(v0, v1): - res0 = torch.ger(v0, v1) - res1 = torch.zeros(100, 100, dtype=dtype, device=device) - for i in range(100): - for j in range(100): - res1[i, j] = v0[i] * v1[j] - self.assertEqual(res0, res1) - - v0 = torch.randn(100, dtype=torch.float, device=device).to(dtype=dtype) - v1 = torch.randn(100, dtype=torch.float, device=device).to(dtype=dtype) - run_test(v0, v1) - - # Tests 0-strided - v0 = torch.randn(1, dtype=torch.float, device=device).expand(100).to(dtype=dtype) - v1 = torch.randn(100, dtype=torch.float, device=device).to(dtype=dtype) - run_test(v0, v1) - - @slowTest - @onlyCPU - @dtypes(torch.bfloat16, torch.float, torch.double) - def test_addr(self, device, dtype): - def run_test(m, v1, v2, m_transform=lambda x: x): - m = m_transform(m.clone()) - ref = m.clone() - torch.addr(m, v1, v2, out=m) - for i in range(m.size(0)): - for j in range(m.size(1)): - ref[i, j] += v1[i] * v2[j] - self.assertEqual(m, ref) - - for h, w in [(100, 110), (1, 20), (200, 2)]: - m = torch.randn(h, w, dtype=torch.float, device=device).to(dtype=dtype) - v1 = torch.randn(h, dtype=torch.float, device=device).to(dtype=dtype) - v2 = torch.randn(w, dtype=torch.float, device=device).to(dtype=dtype) - run_test(m, v1, v2) - # test transpose - run_test(m, v2, v1, lambda x: x.transpose(0, 1)) - # test 0 strided - v1 = torch.randn(1, dtype=torch.float, device=device).expand(h).to(dtype=dtype) - run_test(m, v1, v2) - 
run_test(m, v2, v1, lambda x: x.transpose(0, 1)) - def _test_addmm_addmv(self, f, t, m, v, *, alpha=None, beta=None, transpose_out=False): dtype = t.dtype numpy_dtype = dtype @@ -16388,7 +16687,9 @@ def _test_addmm_addmv(self, f, t, m, v, *, alpha=None, beta=None, transpose_out= @precisionOverride({torch.bfloat16: 1e-0, torch.half: 5e-4, torch.float: 1e-4, torch.double: 1e-8, torch.cfloat: 1e-4, torch.cdouble: 1e-8}) - @dtypesIfCUDA(*torch.testing.get_all_complex_dtypes(), *torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)) + @dtypesIfCUDA(*torch.testing.get_all_complex_dtypes(), + *([torch.float32, torch.float64, torch.bfloat16] + if TEST_WITH_ROCM else torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM))) @dtypes(torch.bfloat16, torch.float, torch.double, torch.cfloat, torch.cdouble) @unittest.skipIf(not TEST_NUMPY, "Numpy not found") def test_addmv(self, device, dtype): @@ -16459,6 +16760,7 @@ def _test(row_major, incx, incy, lda_tail): @dtypesIfCUDA(*torch.testing.get_all_complex_dtypes(), *torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)) @dtypes(*torch.testing.get_all_complex_dtypes(), *torch.testing.get_all_fp_dtypes()) @unittest.skipIf(not TEST_NUMPY, "Numpy not found") + @tf32_on_and_off(0.05) def test_addmm(self, device, dtype): M = torch.randn(10, 25, device=device).to(dtype) m1 = torch.randn(10, 50, device=device).to(dtype) @@ -16834,6 +17136,15 @@ def test_div(self, device, dtype): atol=0.01, rtol=0) self.assertEqual(method(a1, a2), op(a1, a2)) + @dtypes(torch.bfloat16, torch.float) + def test_true_divide_out(self, device, dtype): + a1 = torch.tensor([4.2, 6.2], dtype=dtype, device=device) + a2 = torch.tensor([2., 2.], dtype=dtype, device=device) + res = torch.empty_like(a1) + self.assertEqual(torch.true_divide(a1, a2, out=res), + torch.tensor([2.1, 3.1], dtype=dtype, device=device), + atol=0.01, rtol=0) + @onlyCUDA @dtypes(torch.half) def test_divmul_scalar(self, device, dtype): @@ -16904,11 +17215,8 @@ def test_rdiv(self, device, dtype): else: x = torch.rand(100, device=device).add(1).mul(4).to(dtype) y = 30 / x - if dtype.is_floating_point or dtype.is_complex: - z = torch.tensor([30 / v.item() for v in x], dtype=dtype, device=device) - else: - z = torch.tensor([math.trunc(30. / v.item()) for v in x], dtype=dtype, device=device) - self.assertEqual(y, z) + z = torch.tensor([30 / v.item() for v in x], device=device) + self.assertEqual(y, z, exact_dtype=False) @onlyCPU @dtypes(*torch.testing.get_all_dtypes(include_bfloat16=False, include_bool=False, include_complex=False)) @@ -19630,8 +19938,6 @@ def test_movedim_view(self, device): # with _float_types when bfloat16 bringup is complete on all platforms _float_types2 = _float_types + [torch.bfloat16] if TEST_WITH_ROCM else _float_types -_complex_and_float_types2 = _float_types2 + _complex_types - _signed_types = [ torch.half, torch.float, torch.double, torch.int8, torch.short, torch.int, torch.long @@ -19642,10 +19948,20 @@ def test_movedim_view(self, device): torch.int8, torch.short, torch.int, torch.long ] +_integer_types = [ + torch.uint8, torch.int8, torch.int16, + torch.int32, torch.int64 +] + _cpu_types: List[torch.dtype] = [] _unsigned_types = [torch.uint8] +# Binary Float Ops +# Operators which use TensorIterator::binary_float_op +# These Ops promote integer inputs to Float. +binary_float_ops_inplace = ['atan2_', 'div_'] + # Helper values and functions for producing tensors and scalars to use in tensor op tests. 
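[Editor's illustrative aside, not part of the patch] The new binary_float_ops_inplace list feeds a special case added further down: ops backed by TensorIterator::binary_float_op promote integer inputs to the default float dtype, so their in-place variants have nowhere to store the promoted result and raise the "result type Float can't be cast to ..." error the harness checks for. A minimal sketch of that behaviour, assuming a build with the integer-to-float promotion these tests exercise:

    import torch

    a = torch.tensor([1, 2, 3], dtype=torch.int32)
    b = torch.tensor([4, 5, 6], dtype=torch.int32)

    # Out-of-place: integer inputs are promoted, so the result is the
    # default float dtype.
    print(torch.atan2(a, b).dtype)  # torch.float32

    # In-place: the promoted float result cannot be cast back into the
    # integer input tensor, so the op raises instead.
    try:
        a.atan2_(b)
    except RuntimeError as err:
        print(err)  # e.g. "result type Float can't be cast to the desired output type Int"
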
# Tensor dimension sizes (Small, Medium, Large, Giant) _S = 5 @@ -19802,20 +20118,21 @@ def inner(self, device, dtype): ('pow', 'tensor', _small_3d, lambda t, d: [_small_3d(t, d).abs()], 1e-1, 1e-1, 1e-5, torch.testing.get_all_fp_dtypes()), ('addbmm', '', _small_2d, lambda t, d: [_small_3d(t, d), _small_3d(t, d)], - 1e-1, 1e-1, 1e-4, _complex_and_float_types2, _cpu_types, True, [tf32_on_and_off(0.005)]), + 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types, + _cpu_types, True, [tf32_on_and_off(0.01)]), ('addbmm', 'scalar', _small_2d, lambda t, d: [_number(0.4, 2, t), _small_3d(t, d), _small_3d(t, d)], - 1e-1, 1e-1, 1e-4, _complex_and_float_types2, _cpu_types, True, - [tf32_on_and_off(0.005), _wrap_maybe_warns("This overload of addbmm_? is deprecated")]), + 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types, _cpu_types, True, + [tf32_on_and_off(0.01), _wrap_maybe_warns("This overload of addbmm_? is deprecated")]), ('addbmm', 'two_scalars', _small_2d, lambda t, d: [_number(0.5, 3, t), _number(0.4, 2, t), _small_3d(t, d), _small_3d(t, d)], - 1e-1, 1e-1, 1e-4, _complex_and_float_types2, _cpu_types, True, - [tf32_on_and_off(0.005), _wrap_maybe_warns("This overload of addbmm_? is deprecated")]), + 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types, _cpu_types, True, + [tf32_on_and_off(0.01), _wrap_maybe_warns("This overload of addbmm_? is deprecated")]), ('baddbmm', '', _small_3d, lambda t, d: [_small_3d(t, d), _small_3d(t, d)], - 1e-2, 1e-1, 1e-4, _float_types2), + 1e-2, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)), ('baddbmm', 'scalar', _small_3d, lambda t, d: [_number(0.4, 2, t), _small_3d(t, d), _small_3d(t, d)], - 1e-2, 1e-1, 1e-4, _float_types2, _cpu_types, True, + 1e-2, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM), _cpu_types, True, [_wrap_maybe_warns("This overload of baddbmm_? is deprecated")]), ('baddbmm', 'two_scalars', _small_3d, lambda t, d: [_number(0.5, 3, t), _number(0.4, 2, t), _small_3d(t, d), _small_3d(t, d)], - 1e-2, 1e-1, 1e-4, _float_types2, _cpu_types, True, + 1e-2, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM), _cpu_types, True, [_wrap_maybe_warns("This overload of baddbmm_? is deprecated")]), ('bmm', '', _small_3d, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, _float_types_no_half, _cpu_types, False), @@ -19835,37 +20152,27 @@ def inner(self, device, dtype): [_wrap_maybe_warns("This overload of addcmul_? is deprecated")]), ('addmm', '', _medium_2d, lambda t, d: [_medium_2d(t, d), _medium_2d(t, d)], 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM), - _cpu_types, True, [tf32_on_and_off(0.005)], 0, True), + _cpu_types, True, [tf32_on_and_off(0.01)], 0, True), ('addmm', 'scalar', _medium_2d, lambda t, d: [_number(0.4, 2, t), _medium_2d(t, d), _medium_2d(t, d)], 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM), _cpu_types, True, - [tf32_on_and_off(0.005), _wrap_maybe_warns("This overload of addmm_? is deprecated")]), + [tf32_on_and_off(0.01), _wrap_maybe_warns("This overload of addmm_? 
is deprecated")]), ('addmm', 'two_scalars', _medium_2d, lambda t, d: [_number(0.5, 3, t), _number(0.4, 2, t), _medium_2d(t, d), _medium_2d(t, d)], 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM), _cpu_types, True, - [tf32_on_and_off(0.005), _wrap_maybe_warns("This overload of addmm_? is deprecated")]), + [tf32_on_and_off(0.01), _wrap_maybe_warns("This overload of addmm_? is deprecated")]), ('addmv', '', _medium_1d, lambda t, d: [_medium_2d(t, d), _medium_1d(t, d)], 1e-2, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types_skip_rocm, _cpu_types, - True, [tf32_on_and_off(0.005)], 0, True), + True, [], 0, True), ('addmv', 'scalar', _medium_1d, - lambda t, d: [_number(0.4, 2, t), _medium_2d(t, d), _medium_1d(t, d)], 1e-2, 1e-1, 1e-4, + lambda t, d: [_number(0.4, 2, t), _medium_2d(t, d), _medium_1d(t, d)], 1e-2, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types_skip_rocm, _cpu_types, True, - [tf32_on_and_off(0.005), _wrap_maybe_warns("This overload of addmv_? is deprecated")]), + [_wrap_maybe_warns("This overload of addmv_? is deprecated")]), ('addmv', 'two_scalars', _medium_1d, lambda t, d: [_number(0.5, 3, t), _number(0.4, 2, t), _medium_2d(t, d), _medium_1d(t, d)], 1e-2, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types_skip_rocm, _cpu_types, True, - [tf32_on_and_off(0.005), _wrap_maybe_warns("This overload of addmv_? is deprecated")]), - ('addr', '', _medium_2d, lambda t, d: [_medium_1d(t, d), _medium_1d(t, d)], - 1e-2, 1e-1, 1e-4, _float_types2), - ('addr', 'scalar', _medium_2d, - lambda t, d: [_number(0.4, 2, t), _medium_1d(t, d), _medium_1d(t, d)], - 1e-2, 1e-1, 1e-4, _float_types2, _cpu_types, True, - [_wrap_maybe_warns("This overload of addr_? is deprecated")]), - ('addr', 'two_scalars', _medium_2d, - lambda t, d: [_number(0.5, 3, t), _number(0.4, 2, t), _medium_1d(t, d), _medium_1d(t, d)], - 1e-2, 1e-1, 1e-4, _float_types2, _cpu_types, True, - [_wrap_maybe_warns("This overload of addr_? is deprecated")]), - ('atan2', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-2, 1e-5, 1e-5, _float_types), + [_wrap_maybe_warns("This overload of addmv_? 
is deprecated")]), + ('atan2', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-2, 1e-5, 1e-5, _types, _types_no_half), ('angle', '', _small_3d, lambda t, d: [], 0, 0, 0, _types_no_half, [torch.bfloat16], False), ('fmod', 'value', _small_3d, lambda t, d: [3], 1e-3), ('fmod', 'tensor', _small_3d, lambda t, d: [_small_3d(t, d, has_zeros=False)], 1e-3), @@ -20039,11 +20346,11 @@ def inner(self, device, dtype): ('transpose', 'neg_dim', _new_t((1, 2, 3, 4)), lambda t, d: [-1, -2], ), ('tolist', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), ('topk', 'dim_sort', _small_3d_unique, lambda t, d: [2, 1, False, True], - 1e-5, 1e-5, 1e-5, _types2, _cpu_types, False), + 1e-5, 1e-5, 1e-5, torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), ('topk', 'neg_dim_sort', _small_3d_unique, lambda t, d: [2, -1, False, True], - 1e-5, 1e-5, 1e-5, _types2, _cpu_types, False), + 1e-5, 1e-5, 1e-5, torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), ('topk', 'dim_desc_sort', _small_3d_unique, lambda t, d: [2, 1, True, True], - 1e-5, 1e-5, 1e-5, _types2, _cpu_types, False), + 1e-5, 1e-5, 1e-5, torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), ('trace', '', _medium_2d, lambda t, d: [], 1e-3, 1e-5, 1e-5, _types, _cpu_types, False), ('tril', '', _medium_2d, lambda t, d: [],), ('tril', 'zero_stride', _medium_2d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), @@ -20103,7 +20410,7 @@ def inner(self, device, dtype): ('sigmoid', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, torch.testing.get_all_fp_dtypes()), ('logit', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, torch.testing.get_all_fp_dtypes()), ('sqrt', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, torch.testing.get_all_fp_dtypes(), [torch.bfloat16]), - ('tanh', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, + ('tanh', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, torch.testing.get_all_fp_dtypes() + _complex_types, [torch.bfloat16]), ('asin', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, _float_types, [torch.bfloat16]), ('atan', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, _float_types, [torch.bfloat16]), @@ -20162,6 +20469,15 @@ def fn(self, device, dtype) -> None: (isinstance(arg, torch.Tensor) and arg.dtype == torch.float) else arg for arg in device_args] + # Special case for binary float ops (binary ops that promote int to float) + if op_str in binary_float_ops_inplace and \ + 'inplace' in subtest_str and dtype in _integer_types: + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to "): + cpu_result = getattr(cpu_tensor, op_str)(*cpu_args) + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to "): + device_result = getattr(device_tensor, op_str)(*device_args) + return # Nothing more to check + # Runs the tensor op on CPU and device cpu_result = getattr(cpu_tensor, op_str)(*cpu_args) device_result = getattr(device_tensor, op_str)(*device_args) @@ -20531,7 +20847,7 @@ def _test_svd_helper(self, shape, some, col_maj, device, dtype): @skipCUDAIfNoMagma @skipCPUIfNoLapack - @dtypes(*_float_types_no_half) + @dtypes(*(_float_types_no_half + _complex_types)) def test_svd_square(self, device, dtype): self._test_svd_helper((10, 10), True, False, device, dtype) diff --git a/test/test_type_promotion.py b/test/test_type_promotion.py index 9ee90c7cbcd8..7f10915a5ac4 100644 --- a/test/test_type_promotion.py +++ 
b/test/test_type_promotion.py @@ -7,7 +7,7 @@ from torch.testing._internal.common_utils import (TestCase, run_tests, load_tests, TEST_NUMPY, torch_to_numpy_dtype_dict) from torch.testing._internal.common_device_type import (instantiate_device_type_tests, onlyOnCPUAndCUDA, - dtypes, onlyCPU) + dtypes, dtypesIfCUDA, onlyCPU) if TEST_NUMPY: import numpy as np @@ -958,6 +958,37 @@ def test_computation_ignores_out(self, device): self.assertEqual(result, a - b, exact_dtype=False) self.assertNotEqual(result, a.double() - b, exact_dtype=False) + @dtypesIfCUDA(*itertools.product(torch.testing.get_all_dtypes(include_bfloat16=False, include_complex=False), + torch.testing.get_all_dtypes(include_bfloat16=False, include_complex=False))) + @dtypes(*itertools.product(torch.testing.get_all_dtypes(include_half=False, include_bfloat16=False, + include_complex=False), + torch.testing.get_all_dtypes(include_half=False, include_bfloat16=False, + include_complex=False))) + def test_atan2_type_promotion(self, device, dtypes): + dtype1, dtype2 = dtypes + default_float = torch.get_default_dtype() + + def is_int(dtype): + return dtype in torch.testing.get_all_int_dtypes() + [torch.bool] + + def is_float(dtype): + return dtype in torch.testing.get_all_fp_dtypes(include_half=True, include_bfloat16=False) + + def get_binary_float_result_type(x, y): + dtype1 = x.dtype + dtype2 = y.dtype + if is_float(dtype1) and is_float(dtype2): + return torch.result_type(x, y) + elif is_float(dtype1) and is_int(dtype2): + return dtype1 + elif is_int(dtype1) and is_float(dtype2): + return dtype2 + elif is_int(dtype1) and is_int(dtype2): + return default_float + + x = torch.tensor(1, dtype=dtype1, device=device) + y = torch.tensor(2, dtype=dtype2, device=device) + self.assertEqual(get_binary_float_result_type(x, y), torch.atan2(x, y).dtype) instantiate_device_type_tests(TestTypePromotion, globals()) diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py index 09a3cbd583a7..ddc735199f2d 100644 --- a/test/test_unary_ufuncs.py +++ b/test/test_unary_ufuncs.py @@ -1,6 +1,7 @@ import math from itertools import product, chain from numbers import Number +import random import unittest @@ -377,6 +378,41 @@ def test_batch_vs_slicing(self, device, dtype, op): self.assertEqual(actual, expected) + @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes(include_bfloat16=False))) + def test_nan_to_num(self, device, dtype): + for contiguous in [False, True]: + x = make_tensor((64, 64), low=0., high=100., dtype=dtype, device=device) + + if dtype.is_floating_point: + # Add extremal values. 
+ extremals = [float('nan'), float('inf'), -float('inf')] + for idx, extremal in zip(torch.randint(0, 63, (3,)), extremals): + x[idx, :] = extremal + + if not contiguous: + x = x.T + + # With args + nan = random.random() + posinf = random.random() * 5 + neginf = random.random() * 10 + + self.compare_with_numpy(lambda x: x.nan_to_num(nan=nan, posinf=posinf), + lambda x: np.nan_to_num(x, nan=nan, posinf=posinf), + x) + self.compare_with_numpy(lambda x: x.nan_to_num(posinf=posinf, neginf=neginf), + lambda x: np.nan_to_num(x, posinf=posinf, neginf=neginf), + x) + + # Out Variant + out = torch.empty_like(x) + result = torch.nan_to_num(x) + torch.nan_to_num(x, out=out) + self.assertEqual(result, out) + + result = torch.nan_to_num(x, nan=nan, posinf=posinf, neginf=neginf) + torch.nan_to_num(x, out=out, nan=nan, posinf=posinf, neginf=neginf) + self.assertEqual(result, out) instantiate_device_type_tests(TestUnaryUfuncs, globals()) diff --git a/test/test_utils.py b/test/test_utils.py index bf002541bebf..11b4337b4768 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -4,18 +4,20 @@ import shutil import random import tempfile +import textwrap import unittest import torch import torch.nn as nn import torch.utils.data import torch.cuda from torch.utils.checkpoint import checkpoint, checkpoint_sequential -import torch.utils._benchmark as benchmark_utils +import torch.utils.benchmark as benchmark_utils import torch.hub as hub from torch.autograd._functions.utils import check_onnx_broadcast from torch.onnx.symbolic_opset9 import _prepare_onnx_paddings -from torch.testing._internal.common_utils import load_tests, retry, IS_SANDCASTLE, IS_WINDOWS +from torch.testing._internal.common_utils import load_tests, retry, IS_SANDCASTLE, IS_WINDOWS, slowTest from urllib.error import URLError +import numpy as np # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for # sharding on sandcastle. This line silences flake warnings @@ -621,34 +623,293 @@ def test_timer(self): timer = benchmark_utils.Timer( stmt="torch.ones(())", ) - median = timer.blocked_autorange(min_run_time=0.1).median + sample = timer.timeit(5).median + self.assertIsInstance(sample, float) + + median = timer.blocked_autorange(min_run_time=0.01).median self.assertIsInstance(median, float) + # We set a very high threshold to avoid flakiness in CI. 
+ # The internal algorithm is tested in `test_adaptive_timer` + median = timer.adaptive_autorange(threshold=0.5).median + + class _MockTimer: + _seed = 0 + + _timer_noise_level = 0.05 + _timer_cost = 100e-9 # 100 ns + + _function_noise_level = 0.05 + _function_costs = ( + ("pass", 8e-9), + ("cheap_fn()", 4e-6), + ("expensive_fn()", 20e-6), + ) + + def __init__(self, stmt, setup, timer, globals): + self._random_state = np.random.RandomState(seed=self._seed) + self._mean_cost = {k: v for k, v in self._function_costs}[stmt] + + def sample(self, mean, noise_level): + return max(self._random_state.normal(mean, mean * noise_level), 5e-9) + + def timeit(self, number): + return sum([ + # First timer invocation + self.sample(self._timer_cost, self._timer_noise_level), + + # Stmt body + self.sample(self._mean_cost * number, self._function_noise_level), + + # Second timer invocation + self.sample(self._timer_cost, self._timer_noise_level), + ]) + def test_adaptive_timer(self): - # Validate both on different sizes validate against blocked_autorange - # This looks for relative differences btetween orders of magnitude to - # provide a stable/portable test which is somewhat informative. - timer = benchmark_utils.Timer( - stmt="torch.sum(torch.ones((10,10)))", + class MockTimer(benchmark_utils.Timer): + _timer_cls = self._MockTimer + + def assert_reprs_match(measurement, expected): + measurement_repr = re.sub( + "object at 0x[0-9a-fA-F]+>", + "object at 0xXXXXXXXXXXXX>", + repr(measurement) + ) + self.assertEqual(measurement_repr, textwrap.dedent(expected).strip()) + + assert_reprs_match( + MockTimer("pass").blocked_autorange(min_run_time=10), + """ + + pass + Median: 7.98 ns + IQR: 0.52 ns (7.74 to 8.26) + 125 measurements, 10000000 runs per measurement, 1 thread""" ) - small = timer.adaptive_autorange(min_run_time=0.1, max_run_time=1.0) - timer = benchmark_utils.Timer( - stmt="torch.sum(torch.ones((500,500)))", + + assert_reprs_match( + MockTimer("pass").adaptive_autorange(), + """ + + pass + Median: 7.86 ns + IQR: 0.71 ns (7.63 to 8.34) + 6 measurements, 1000000 runs per measurement, 1 thread""" + ) + + assert_reprs_match( + MockTimer("cheap_fn()").blocked_autorange(min_run_time=10), + """ + + cheap_fn() + Median: 3.98 us + IQR: 0.27 us (3.85 to 4.12) + 252 measurements, 10000 runs per measurement, 1 thread""" + ) + + assert_reprs_match( + MockTimer("cheap_fn()").adaptive_autorange(), + """ + + cheap_fn() + Median: 4.16 us + IQR: 0.22 us (4.04 to 4.26) + 4 measurements, 1000 runs per measurement, 1 thread""" + ) + + assert_reprs_match( + MockTimer("expensive_fn()").blocked_autorange(min_run_time=10), + """ + + expensive_fn() + Median: 19.97 us + IQR: 1.35 us (19.31 to 20.65) + 501 measurements, 1000 runs per measurement, 1 thread""" + ) + + assert_reprs_match( + MockTimer("expensive_fn()").adaptive_autorange(), + """ + + expensive_fn() + Median: 20.79 us + IQR: 1.09 us (20.20 to 21.29) + 4 measurements, 1000 runs per measurement, 1 thread""" + ) + + class _MockCudaTimer(self._MockTimer): + # torch.cuda.synchronize is much more expensive than + # just timeit.default_timer + _timer_cost = 10e-6 + + _function_costs = ( + self._MockTimer._function_costs[0], + self._MockTimer._function_costs[1], + + # GPU should be faster once there is enough work. 
+ ("expensive_fn()", 5e-6), + ) + + class MockCudaTimer(benchmark_utils.Timer): + _timer_cls = _MockCudaTimer + + configurations = ( + (7.9903966e-09, 376, 1000000, MockTimer("pass")), + (7.8554826e-09, 4, 100000000, MockCudaTimer("pass")), + (3.9930536e-06, 752, 1000, MockTimer("cheap_fn()")), + (3.9441239e-06, 8, 100000, MockCudaTimer("cheap_fn()")), + (1.9994249e-05, 150, 1000, MockTimer("expensive_fn()")), + (4.9301076e-06, 6, 100000, MockCudaTimer("expensive_fn()")), ) - medium = timer.adaptive_autorange(min_run_time=0.1, max_run_time=1.0) - blocked_medium = timer.blocked_autorange(min_run_time=0.1) - self.assertLess(small.median, medium.median) - # This acts as a control to compare to a different way to measure the same value. - self.assertLess(small.median, blocked_medium.median) + + for median, repeats, number_per_run, timer_instance in configurations: + measurement = timer_instance.blocked_autorange(min_run_time=3) + self.assertEqual(measurement.median, median) + self.assertEqual(len(measurement.times), repeats) + self.assertEqual(measurement.number_per_run, number_per_run) + + @slowTest + @unittest.skipIf(IS_WINDOWS, "Valgrind is not supported on Windows.") + def test_collect_callgrind(self): + timer = benchmark_utils.Timer("y = torch.ones((1,)) + 1") + + # Don't collect baseline to speed up unit test by ~30 seconds. + stats = timer.collect_callgrind(number=1000, collect_baseline=False) + + self.assertIsInstance(stats.counts(include_lookdict_unicode=False), int) def test_compare(self): - compare = benchmark_utils.Compare([ - benchmark_utils.Timer( - "torch.ones((n,))", globals={"n": n}, - description="ones", label=str(n)).timeit(3) - for n in range(3) - ]) - compare.print() + # Simulate several approaches. + costs = ( + # overhead_optimized_fn() + (1e-6, 1e-9), + + # compute_optimized_fn() + (3e-6, 5e-10), + + # special_case_fn() [square inputs only] + (1e-6, 4e-10), + ) + + sizes = ( + (16, 16), + (16, 128), + (128, 128), + (4096, 1024), + (2048, 2048), + ) + + # overhead_optimized_fn() + class _MockTimer_0(self._MockTimer): + _function_costs = tuple( + (f"fn({i}, {j})", costs[0][0] + costs[0][1] * i * j) + for i, j in sizes + ) + + class MockTimer_0(benchmark_utils.Timer): + _timer_cls = _MockTimer_0 + + # compute_optimized_fn() + class _MockTimer_1(self._MockTimer): + _function_costs = tuple( + (f"fn({i}, {j})", costs[1][0] + costs[1][1] * i * j) + for i, j in sizes + ) + + class MockTimer_1(benchmark_utils.Timer): + _timer_cls = _MockTimer_1 + + # special_case_fn() + class _MockTimer_2(self._MockTimer): + _function_costs = tuple( + (f"fn({i}, {j})", costs[2][0] + costs[2][1] * i * j) + for i, j in sizes if i == j + ) + + class MockTimer_2(benchmark_utils.Timer): + _timer_cls = _MockTimer_2 + + results = [] + for i, j in sizes: + results.append( + MockTimer_0( + f"fn({i}, {j})", + label="fn", + description=f"({i}, {j})", + sub_label="overhead_optimized", + ).blocked_autorange(min_run_time=10) + ) + + results.append( + MockTimer_1( + f"fn({i}, {j})", + label="fn", + description=f"({i}, {j})", + sub_label="compute_optimized", + ).blocked_autorange(min_run_time=10) + ) + + if i == j: + results.append( + MockTimer_2( + f"fn({i}, {j})", + label="fn", + description=f"({i}, {j})", + sub_label="special_case (square)", + ).blocked_autorange(min_run_time=10) + ) + + def check_output(output: str, expected: str): + # VSCode will strip trailing newlines from `expected`, so we have to match + # this behavior when comparing output. 
+ output_str = "\n".join( + i.rstrip() for i in output.strip().splitlines(keepends=False)) + + self.assertEqual(output_str, textwrap.dedent(expected).strip()) + + compare = benchmark_utils.Compare(results) + + check_output( + str(compare), + """ + [------------------------------------------------- fn ------------------------------------------------] + | (16, 16) | (16, 128) | (128, 128) | (4096, 1024) | (2048, 2048) + 1 threads: -------------------------------------------------------------------------------------------- + overhead_optimized | 1.3 | 3.0 | 17.4 | 4174.4 | 4174.4 + compute_optimized | 3.1 | 4.0 | 11.2 | 2099.3 | 2099.3 + special_case (square) | 1.1 | | 7.5 | | 1674.7 + + Times are in microseconds (us).""" + ) + + compare.trim_significant_figures() + check_output( + str(compare), + """ + [------------------------------------------------- fn ------------------------------------------------] + | (16, 16) | (16, 128) | (128, 128) | (4096, 1024) | (2048, 2048) + 1 threads: -------------------------------------------------------------------------------------------- + overhead_optimized | 1 | 3.0 | 17 | 4200 | 4200 + compute_optimized | 3 | 4.0 | 11 | 2100 | 2100 + special_case (square) | 1 | | 8 | | 1700 + + Times are in microseconds (us).""" + ) + + compare.colorize() + check_output( + str(compare), + """ + [------------------------------------------------- fn ------------------------------------------------] + | (16, 16) | (16, 128) | (128, 128) | (4096, 1024) | (2048, 2048) + 1 threads: -------------------------------------------------------------------------------------------- + overhead_optimized | 1 | \x1b[92m\x1b[1m 3.0 \x1b[0m\x1b[0m | \x1b[2m\x1b[91m 17 \x1b[0m\x1b[0m | 4200 | \x1b[2m\x1b[91m 4200 \x1b[0m\x1b[0m + compute_optimized | \x1b[2m\x1b[91m 3 \x1b[0m\x1b[0m | 4.0 | 11 | \x1b[92m\x1b[1m 2100 \x1b[0m\x1b[0m | 2100 + special_case (square) | \x1b[92m\x1b[1m 1 \x1b[0m\x1b[0m | | \x1b[92m\x1b[1m 8 \x1b[0m\x1b[0m | | \x1b[92m\x1b[1m 1700 \x1b[0m\x1b[0m + + Times are in microseconds (us).""" # noqa + ) + @unittest.skipIf(IS_WINDOWS and os.getenv("VC_YEAR") == "2019", "Random seed only accepts int32") def test_fuzzer(self): @@ -671,5 +932,13 @@ def test_fuzzer(self): x, torch.Tensor(expected_results[i]), rtol=1e-3, atol=1e-3) +class TestAssert(TestCase): + def test_assert_true(self): + # verify assertions work as expected + torch.Assert(True, "foo") + with self.assertRaisesRegex(AssertionError, "bar"): + torch.Assert(False, "bar") + + if __name__ == '__main__': run_tests() diff --git a/test/test_xnnpack_integration.py b/test/test_xnnpack_integration.py index a40ec48f2f37..56c44b904b47 100644 --- a/test/test_xnnpack_integration.py +++ b/test/test_xnnpack_integration.py @@ -12,10 +12,12 @@ import io import itertools +from torch.testing._internal.common_utils import TEST_WITH_TSAN @unittest.skipUnless(torch.backends.xnnpack.enabled, " XNNPACK must be enabled for these tests." " Please build with USE_XNNPACK=1.") +@unittest.skipIf(TEST_WITH_TSAN, "TSAN fails with XNNPACK. Does not seem to have a good reason for failures.") class TestXNNPACKOps(TestCase): @given(batch_size=st.integers(0, 3), data_shape=hu.array_shapes(1, 3, 2, 64), @@ -161,6 +163,7 @@ def test_conv2d_transpose(self, @unittest.skipUnless(torch.backends.xnnpack.enabled, " XNNPACK must be enabled for these tests." " Please build with USE_XNNPACK=1.") +@unittest.skipIf(TEST_WITH_TSAN, "TSAN fails with XNNPACK. 
Does not seem to have a good reason for failures.") class TestXNNPACKSerDes(TestCase): @given(batch_size=st.integers(0, 3), data_shape=hu.array_shapes(1, 3, 2, 64), @@ -551,6 +554,7 @@ def forward(self, x): @unittest.skipUnless(torch.backends.xnnpack.enabled, " XNNPACK must be enabled for these tests." " Please build with USE_XNNPACK=1.") +@unittest.skipIf(TEST_WITH_TSAN, "TSAN fails with XNNPACK. Does not seem to have a good reason for failures.") class TestXNNPACKRewritePass(TestCase): @staticmethod def validate_transformed_module( @@ -911,6 +915,7 @@ def forward(self, x): @unittest.skipUnless(torch.backends.xnnpack.enabled, " XNNPACK must be enabled for these tests." " Please build with USE_XNNPACK=1.") +@unittest.skipIf(TEST_WITH_TSAN, "TSAN is not fork-safe since we're forking in a multi-threaded environment") class TestXNNPACKConv1dTransformPass(TestCase): @staticmethod def validate_transform_conv1d_to_conv2d( diff --git a/third_party/fbgemm b/third_party/fbgemm index 1d710393d5b7..fe9164007c33 160000 --- a/third_party/fbgemm +++ b/third_party/fbgemm @@ -1 +1 @@ -Subproject commit 1d710393d5b7588f5de3b83f51c22bbddf095229 +Subproject commit fe9164007c3392a12ea51a19b0f4e9f40d24f88d diff --git a/third_party/fmt b/third_party/fmt index 9bdd1596cef1..cd4af11efc9c 160000 --- a/third_party/fmt +++ b/third_party/fmt @@ -1 +1 @@ -Subproject commit 9bdd1596cef1b57b9556f8bef32dc4a32322ef3e +Subproject commit cd4af11efc9c622896a3e4cb599fa28668ca3d05 diff --git a/third_party/foxi b/third_party/foxi index 9ca418d2f4bc..4aba696ec8f3 160000 --- a/third_party/foxi +++ b/third_party/foxi @@ -1 +1 @@ -Subproject commit 9ca418d2f4bc8e022d843388afa0fd0a14bd57dc +Subproject commit 4aba696ec8f31794fd42880346dc586486205e0a diff --git a/third_party/nccl/nccl b/third_party/nccl/nccl index 195232556936..033d799524fb 160000 --- a/third_party/nccl/nccl +++ b/third_party/nccl/nccl @@ -1 +1 @@ -Subproject commit 195232556936b39b01cc908296e1650b80d4a3e9 +Subproject commit 033d799524fb97629af5ac2f609de367472b2696 diff --git a/third_party/tensorpipe b/third_party/tensorpipe index 9646e1a43199..95ff9319161f 160000 --- a/third_party/tensorpipe +++ b/third_party/tensorpipe @@ -1 +1 @@ -Subproject commit 9646e1a431997edb1579972cef196d8fb97a77a5 +Subproject commit 95ff9319161fcdb3c674d2bb63fac3e94095b343 diff --git a/third_party/valgrind b/third_party/valgrind new file mode 160000 index 000000000000..2593ccd82c18 --- /dev/null +++ b/third_party/valgrind @@ -0,0 +1 @@ +Subproject commit 2593ccd82c189bf40b60a3a4934c5d0bbdb75427 diff --git a/tools/amd_build/build_amd.py b/tools/amd_build/build_amd.py index acecbe737e6d..026293a9281a 100755 --- a/tools/amd_build/build_amd.py +++ b/tools/amd_build/build_amd.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -from __future__ import absolute_import, division, print_function + import os import argparse import sys diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 9ee296e83035..2af8ee81604e 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -159,7 +159,7 @@ # NB: The parameter names here MUST be consistent with the parameter names # in Decalarations.yaml - name: abs(Tensor self) -> Tensor - self: grad * self.sign() + self: grad * self.sgn() - name: acos(Tensor self) -> Tensor self: grad * -((-self * self + 1).rsqrt()) @@ -397,19 +397,19 @@ # of the higher order derivatives, see https://github.com/pytorch/pytorch/issues/43414 # Note that we don't use "result" because saving it would be BC-breaking when it is used in an 
inplace operation later - name: div.Tensor(Tensor self, Tensor other) -> Tensor - self: grad / other - other: -grad * (self / other) / other + self: div_tensor_self_backward(grad, other, self.scalar_type()) + other: div_tensor_other_backward(grad, self, other) - name: div.Scalar(Tensor self, Scalar other) -> Tensor - self: grad / other + self: div_tensor_self_backward(grad, at::scalar_to_tensor(other), self.scalar_type()) - name: dot(Tensor self, Tensor tensor) -> Tensor - self: grad * tensor - tensor: grad * self + self: handle_r_to_c(self.scalar_type(), grad * tensor.conj()) + tensor: handle_r_to_c(tensor.scalar_type(), grad * self.conj()) - name: vdot(Tensor self, Tensor other) -> Tensor - self: 'not_implemented("vdot: self")' - other: 'not_implemented("vdot: other")' + self: handle_r_to_c(self.scalar_type(), grad.conj() * other) + other: handle_r_to_c(other.scalar_type(), grad * self) - name: _fused_dropout(Tensor self, float p, Generator? generator=None) -> (Tensor, Tensor) self: _fused_dropout_backward(grad, result1, p) @@ -749,6 +749,9 @@ - name: mvlgamma(Tensor self, int p) -> Tensor self: mvlgamma_backward(grad, self, p) +- name: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor + self: grad * at::isfinite(self) + - name: native_batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor) input, weight, bias: "grad.defined() ? native_batch_norm_backward(grad, input, weight, running_mean, running_var, result1, result2, training, eps, grad_input_mask) : std::tuple()" @@ -887,7 +890,7 @@ self: renorm_backward(grad, self, p, dim, maxnorm) - name: repeat(Tensor self, int[] repeats) -> Tensor - self: repeat_backward(grad, self.dim(), repeats) + self: repeat_backward(grad, repeats, self.sizes()) # DO NOT define a backward for reshape! # reshape is special in that it sometimes returns a view, and sometimes not. 
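[Editor's illustrative aside, not part of the patch] The derivative entry added above for nan_to_num is just a mask: grad * at::isfinite(self) passes the incoming gradient through for finite inputs and zeroes it wherever the input was nan or +/-inf. A small sketch of what that means, assuming a build that already contains the new op:

    import torch

    x = torch.tensor([1.0, float('nan'), float('inf'), -2.0], requires_grad=True)
    y = torch.nan_to_num(x, nan=0.0, posinf=100.0, neginf=-100.0)
    y.sum().backward()

    # The gradient of sum() is all ones; multiplying by isfinite(x) zeroes
    # the entries that came from nan/inf inputs.
    print(x.grad)  # tensor([1., 0., 0., 1.])
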
@@ -928,6 +931,9 @@ - name: sign(Tensor self) -> Tensor self: zeros_like(grad) +- name: sgn(Tensor self) -> Tensor + self: sgn_backward(result, grad, self) + - name: sin(Tensor self) -> Tensor self: grad * self.cos().conj() @@ -1218,9 +1224,9 @@ self: nll_loss2d_backward(grad, self, target, weight, reduction, ignore_index, total_weight) target: non_differentiable -- name: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor - self: smooth_l1_loss_backward(grad, self, target, reduction) - target: smooth_l1_loss_backward(grad, target, self, reduction) +- name: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean, float beta=1.0) -> Tensor + self: smooth_l1_loss_backward(grad, self, target, reduction, beta) + target: smooth_l1_loss_backward(grad, target, self, reduction, beta) - name: soft_margin_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor self: soft_margin_loss_backward(grad, self, target, reduction) @@ -1586,10 +1592,10 @@ grad_output: replication_pad3d(grad, padding) self: zeros_like(self) -- name: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor - grad_output: smooth_l1_loss_double_backward_grad_output(grad, grad_output, self, target, reduction) - self: smooth_l1_loss_double_backward(grad * grad_output, self, target, reduction) - target: -smooth_l1_loss_double_backward(grad * grad_output, self, target, reduction) +- name: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta) -> Tensor + grad_output: smooth_l1_loss_double_backward_grad_output(grad, grad_output, self, target, reduction, beta) + self: smooth_l1_loss_double_backward(grad * grad_output, self, target, reduction, beta) + target: -smooth_l1_loss_double_backward(grad * grad_output, self, target, reduction, beta) - name: softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output) -> Tensor grad_output: softplus_backward(grad, self, beta, threshold, output) diff --git a/tools/autograd/gen_autograd.py b/tools/autograd/gen_autograd.py index 82d908de6180..c12e9b2003d8 100644 --- a/tools/autograd/gen_autograd.py +++ b/tools/autograd/gen_autograd.py @@ -115,20 +115,6 @@ def has_tensoroptions_argument(declaration): return True return False -def process_schema_order_arg(schema_order_arg): - if schema_order_arg == 'dtype': - return 'optTypeMetaToScalarType(options.dtype_opt())' - elif schema_order_arg == 'layout': - return 'options.layout_opt()' - elif schema_order_arg == 'device': - return 'options.device_opt()' - elif schema_order_arg == 'pin_memory': - return 'options.pinned_memory_opt()' - elif schema_order_arg == 'memory_format': - return 'c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format)' - else: - return schema_order_arg - def load_aten_declarations(path): with open(path, 'r') as f: @@ -142,6 +128,8 @@ def load_aten_declarations(path): for arg in declaration['arguments']: arg['simple_type'] = get_simple_type(arg) + for arg in declaration['schema_order_arguments']: + arg['simple_type'] = get_simple_type(arg) for ret in declaration['returns']: ret['simple_type'] = get_simple_type(ret) @@ -151,8 +139,6 @@ def load_aten_declarations(path): for arg in declaration['schema_order_arguments']] declaration['args'] = [arg['name'] for arg in declaration['arguments']] declaration['schema_order_args'] = [arg['name'] for arg in declaration['schema_order_arguments']] - if has_tensoroptions_argument(declaration): - 
declaration['schema_order_args'] = [process_schema_order_arg(arg) for arg in declaration['schema_order_args']] declaration['api_name'] = declaration['name'] if declaration.get('overload_name'): declaration['type_wrapper_name'] = "{}_{}".format( diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index b7fa4a3a8308..eb5de6f75ef5 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -281,6 +281,7 @@ def create_python_bindings(python_functions, is_python_method, module): 'c10::optional': 'toBoolOptional', 'c10::optional': 'toDoubleOptional', 'c10::optional>': 'doublelistOptional', + 'ArrayRef': 'doublelist', 'IntArrayRef': 'intlist', 'Scalar': 'scalar', 'ScalarType': 'scalartype', diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 804da9193a50..6e0dc0721aed 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -22,7 +22,7 @@ # which will in turn dispatch back to VariableType for its # differentiable subcomponents. # -from __future__ import print_function + from .utils import CodeTemplate, nested_dict, write, uninplace_api_name from .gen_autograd import VIEW_FUNCTIONS, VIEW_FUNCTIONS_WITH_METADATA_CHANGE, \ MULTI_OUTPUT_SAFE_FUNCTIONS, RETURNS_VIEWS_OF_INPUT @@ -71,12 +71,18 @@ # arguments (inside of the `native_functions.yaml`) RENAME_TRACE_ADD_ARGS = { 'fill': '''\ - jit::tracer::addInputs(node, "options", TensorOptions()); + jit::tracer::addInputs(node, "options", c10::optional()); + jit::tracer::addInputs(node, "options", layout_or_default(c10::nullopt)); + jit::tracer::addInputs(node, "options", device_or_default(c10::nullopt)); + jit::tracer::addInputs(node, "options", pinned_memory_or_default(c10::nullopt)); c10::optional memory_format = c10::MemoryFormat::Preserve; jit::tracer::addInputs(node, "memory_format", memory_format); ''', 'zero': '''\ - jit::tracer::addInputs(node, "options", TensorOptions()); + jit::tracer::addInputs(node, "options", c10::optional()); + jit::tracer::addInputs(node, "options", layout_or_default(c10::nullopt)); + jit::tracer::addInputs(node, "options", device_or_default(c10::nullopt)); + jit::tracer::addInputs(node, "options", pinned_memory_or_default(c10::nullopt)); c10::optional memory_format = c10::MemoryFormat::Preserve; jit::tracer::addInputs(node, "memory_format", memory_format); ''', @@ -139,7 +145,24 @@ 'quantize_per_tensor', 'quantize_per_channel', # Functions that return integers should not have output that require gradients 'argmax', 'argmin', 'argsort', 'searchsorted', - 'bucketize' + 'bucketize', + # Functions that return booleans are not differentiable + 'isnan', 'isposinf', 'isneginf', 'isinf' +} + +# The C -> R functions at the time of adding this are still being audited and tested +# but will not error out. 
+# C -> C, R -> C functions for which backward is correctly implemented and tested +GRADIENT_IMPLEMENTED_FOR_COMPLEX = { + 't', 'view', 'reshape', 'reshape_as', 'view_as', 'roll', 'clone', + 'repeat', 'expand', 'flip', 'fliplr', 'flipud', 'rot90', 'transpose', + 'permute', 'squeeze', 'unsqueeze', 'resize', 'resize_as', 'tril', 'triu', + 'chunk', 'split', 'split_with_sizes', 'repeat', 'expand', 'zero_', 'eq_', + 'ne_', 'add', '__radd__', 'sum', '_conj', 'sin', 'cos', 'mul', 'sinh', + 'cosh', '__rmul__', 'sgn', 'asin', 'acos', 'sub', 'div', 'cat', 'view_as_complex', + 'neg', 'complex', 'select', '_s_where', 'as_strided', 'slice', 'constant_pad_nd', + 'unbind', 'split', 'split_with_sizes', 'unsafe_split', 'split_with_sizes_backward', + 'dot', 'vdot', 'cholesky' } # Some operators invalidate the grad_accumulator. Let's reset it. @@ -232,14 +255,14 @@ WRAPPER_REGISTRATION = CodeTemplate("""\ m.impl("${unqual_operator_name_with_overload}", - c10::impl::hacky_wrapper_for_legacy_signatures<${schema_order_cpp_signature}>(TORCH_FN(${class_type}::${type_wrapper_name})) + TORCH_FN(${class_type}::${type_wrapper_name}) ); """) UNPACK_TENSOR = CodeTemplate("""\ auto${ref} ${arg_name}_ = unpack${suffix}(${arg_name}, "${arg_name}", ${arg_pos});""") -UNPACK_OPTIONS = CodeTemplate("""\ +LEGACY_WRAP_OPTIONS = CodeTemplate("""\ auto ${arg_name}_ = TensorOptions(${arg_name});""") DECLARE_GRAD_FN = CodeTemplate("""\ @@ -370,7 +393,8 @@ # Generate a file that lists all functions and their schema string. Used for XLA REGISTRATION_DECLARATION = CodeTemplate("""\ -${return_type} ${api_name}(${declaration_formals}); // {"schema": "${schema_string}", "compound": "${compound}"} +${return_type} ${api_name}(${declaration_formals}); \ +// {"schema": "${schema_string}", "compound": "${compound}", "has_math_kernel": "${has_math_kernel}"} """) # TraceType templates @@ -490,15 +514,28 @@ def format_trace_op_name(declaration): def format_trace_inputs(declaration): + gather_tensor_options = "TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory)" + def dispatch_trace_input(arg_spec): name, value, simple_type, nullable = arg_spec # XXX: For arg that have type of Tensor?[], tracer will pass allow_undefined to addInputs if simple_type == 'TensorList' and nullable: return '''jit::tracer::addInputs(node, "{}", {}, {});'''.format(name, value, "true") else: - return ADD_TRACE_INPUT.substitute(name=name, input=value) + if value == "options": + result = "" + result += ADD_TRACE_INPUT.substitute(name=name, input="optTypeMetaToScalarType(options.dtype_opt())") + "\n" + result += ADD_TRACE_INPUT.substitute(name=name, input="options.layout()") + "\n" + result += ADD_TRACE_INPUT.substitute(name=name, input="options.device()") + "\n" + result += ADD_TRACE_INPUT.substitute(name=name, input="options.pinned_memory()") + return result + else: + return ADD_TRACE_INPUT.substitute(name=name, input=value) - trace_inputs = declaration['arguments'] + if declaration['use_c10_dispatcher'] == 'full': + trace_inputs = declaration['schema_order_arguments'] + else: + trace_inputs = declaration['arguments'] if is_out_overload(declaration): # *_out functions take the result as a first argument, but they are the @@ -506,7 +543,10 @@ def dispatch_trace_input(arg_spec): out_input = trace_inputs[0] trace_inputs = trace_inputs[1:] - trace_input_spec = [(i['name'], i['name'], i['simple_type'], i.get('is_nullable')) for i in trace_inputs] + if declaration['use_c10_dispatcher'] == 'full': + trace_input_spec = [(i['name'], i['name'], 
i['type'], i.get('is_nullable')) for i in trace_inputs] + else: + trace_input_spec = [(i['name'], i['name'], i['simple_type'], i.get('is_nullable')) for i in trace_inputs] trace_inputs = \ '\n'.join(dispatch_trace_input(arg_spec) for arg_spec in trace_input_spec) @@ -514,7 +554,8 @@ def dispatch_trace_input(arg_spec): if is_out_overload(declaration): # for *_out functions, handle the result argument differently for inplace/outplace. # For inplace: just add the input to the end to confirm with the JIT schema - inplace = ADD_TRACE_INPUT.substitute(name=out_input['name'], input=out_input['name']) + value = out_input['name'] + inplace = ADD_TRACE_INPUT.substitute(name=out_input['name'], input=value) # for outplace: do nothing, except if the declaration is a factory. # Factories are a bit special because their out-of-place overloads @@ -522,7 +563,11 @@ def dispatch_trace_input(arg_spec): trace_name = uninplace_api_name(declaration['api_name']) has_factory_name = trace_name in FACTORY_FUNCTION_NAMES if has_factory_name: - outplace = ADD_TRACE_INPUT.substitute(name='out', input='out.options()') + outplace = "" + outplace += ADD_TRACE_INPUT.substitute(name='out', input='optTypeMetaToScalarType(out.options().dtype_opt())') + "\n" + outplace += ADD_TRACE_INPUT.substitute(name='out', input='out.options().layout()') + "\n" + outplace += ADD_TRACE_INPUT.substitute(name='out', input='out.options().device()') + "\n" + outplace += ADD_TRACE_INPUT.substitute(name='out', input='out.options().pinned_memory()') else: outplace = '' @@ -654,12 +699,12 @@ def gen_variable_type(out, aten_declarations, template_path): registration_declarations.append( REGISTRATION_DECLARATION.substitute(declaration, declaration_formals=declaration_formals, - compound='false')) + compound='False')) else: registration_declarations.append( REGISTRATION_DECLARATION.substitute(declaration, declaration_formals=declaration_formals, - compound='true')) + compound='True')) env = { 'registration_declarations': registration_declarations, @@ -680,12 +725,17 @@ def gen_variable_type_shard(out, aten_declarations, template_path, suffix, heade for declaration in aten_declarations: formal_types = [arg['type'] for arg in declaration['arguments']] - type_declarations.append(METHOD_DECLARATION.substitute(declaration)) + if declaration['use_c10_dispatcher'] == 'full': + formals = declaration['schema_order_formals'] + else: + assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' + formals = declaration['formals'] + type_declarations.append(METHOD_DECLARATION.substitute(declaration, formals=formals)) strategy = dispatch_strategy(declaration) if declaration['name'] not in MANUAL_AUTOGRAD and strategy == 'use_derived': body = emit_body(declaration) type_definitions.append(METHOD_DEFINITION.substitute( - declaration, type_definition_body=body)) + declaration, type_definition_body=body, formals=formals)) if declaration['use_c10_dispatcher'] == 'full': wrapper_registrations.append(WRAPPER_REGISTRATION.substitute( declaration, class_type='VariableType')) @@ -701,7 +751,7 @@ def gen_variable_type_shard(out, aten_declarations, template_path, suffix, heade if declaration['name'] not in MANUAL_TRACER: trace_body = emit_trace_body(declaration) trace_method_definitions.append(METHOD_DEFINITION.substitute( - declaration, type_definition_body=trace_body)) + declaration, type_definition_body=trace_body, formals=formals)) if declaration['use_c10_dispatcher'] == 'full': trace_wrapper_registrations.append(WRAPPER_REGISTRATION.substitute( @@ 
-925,6 +975,16 @@ def setup_derivative(differentiable_inputs): body.append(SETUP_DERIVATIVE.substitute(env, setup=setup)) return body + def emit_check_if_in_complex_autograd_allowlist(): + body = [] + if base_name in GRADIENT_IMPLEMENTED_FOR_COMPLEX: + return body + for arg in differentiable_outputs: + name = arg['name'] + if arg['type'] == 'Tensor' or arg['type'] == 'TensorList': + body.append('throw_error_for_complex_autograd({}, "{}");'.format(name, base_name)) + return body + def emit_check_no_requires_grad(tensor_args, args_with_derivatives): """Checks that arguments without derivatives don't require grad""" body = [] @@ -1182,6 +1242,7 @@ def emit_increment_version(): body.append(emit_history()) if requires_derivative: body.append(emit_save_outputs()) + body.extend(emit_check_if_in_complex_autograd_allowlist()) if base_name in RESET_GRAD_ACCUMULATOR: # `inplace` implies that there is exactly one output named `self`, # so we can keep the generated code easy. If you need to @@ -1201,7 +1262,12 @@ def requires_unpack(arg): body = [] unpacked_args = [] unpacked_args_simple_type = {} - for i, arg in enumerate(declaration['arguments']): + if declaration['use_c10_dispatcher'] == 'full': + arguments = declaration['schema_order_arguments'] + else: + assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' + arguments = declaration['arguments'] + for i, arg in enumerate(arguments): if not requires_unpack(arg): unpacked_args.append(arg['name']) unpacked_args_simple_type[arg['name']] = arg['simple_type'] @@ -1223,7 +1289,9 @@ def requires_unpack(arg): # Okay, we are abusing the definition of 'unpack' here a bit, # although it's still getting the non-variable from the variable # (in this case via TensorOptions rather than Variable/Tensor). 
- body.append(UNPACK_OPTIONS.substitute(arg_name=arg['name'])) + assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper', \ + "VariableKernel shouldn't take TensorOptions if the op is c10-full" + body.append(LEGACY_WRAP_OPTIONS.substitute(arg_name=arg['name'])) unpacked_args.append(arg['name'] + '_') unpacked_args_simple_type[arg['name'] + '_'] = arg['simple_type'] diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index efddffbe7610..079427cd97dc 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -3,7 +3,6 @@ #include #include -#include // ${generated_comment} diff --git a/tools/autograd/templates/python_fft_functions.cpp b/tools/autograd/templates/python_fft_functions.cpp index 7d0186538c98..1dbdca565792 100644 --- a/tools/autograd/templates/python_fft_functions.cpp +++ b/tools/autograd/templates/python_fft_functions.cpp @@ -7,14 +7,27 @@ #include "torch/csrc/autograd/python_variable.h" #include "torch/csrc/autograd/utils/wrap_outputs.h" #include "torch/csrc/autograd/utils/python_arg_parsing.h" +#include "torch/csrc/autograd/generated/variable_factories.h" #include "torch/csrc/utils/python_arg_parser.h" #include "torch/csrc/utils/structseq.h" +#include "torch/csrc/utils/cuda_lazy_init.h" + +#include using at::Tensor; +using at::Device; +using at::Layout; using at::Scalar; -using at::MemoryFormat; -using at::Generator; +using at::ScalarType; +using at::Backend; +using at::OptionalDeviceGuard; +using at::DeviceGuard; +using at::TensorOptions; using at::IntArrayRef; +using at::Generator; +using at::TensorList; +using at::Dimname; +using at::DimnameList; using namespace torch::autograd::utils; diff --git a/tools/autograd/templates/python_torch_functions.cpp b/tools/autograd/templates/python_torch_functions.cpp index 62e9b8dd227f..aac41111e1bf 100644 --- a/tools/autograd/templates/python_torch_functions.cpp +++ b/tools/autograd/templates/python_torch_functions.cpp @@ -44,6 +44,7 @@ using at::Generator; using at::TensorList; using at::Dimname; using at::DimnameList; +using at::ArrayRef; using namespace torch::autograd::utils; @@ -582,29 +583,29 @@ static PyObject * THPVariable_nonzero(PyObject* self, PyObject* args, PyObject* { HANDLE_TH_ERRORS static PythonArgParser parser({ - "nonzero(Tensor input, *, Tensor out=None)|deprecated", - "nonzero(Tensor input, *, bool as_tuple)", + "nonzero(Tensor input, *, bool as_tuple=False, Tensor out=None)", }); - ParsedArgs<2> parsed_args; + ParsedArgs<3> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); if(r.has_torch_function()){ return handle_torch_function(r, args, kwargs, THPVariableFunctionsModule, "torch"); } - if (r.idx == 0) { - if (r.isNone(1)) { - return wrap(dispatch_nonzero(r.tensor(0))); - } else { - return wrap(dispatch_nonzero(r.tensor(0), r.tensor(1))); - } - } else { - if (r.toBool(1)) { - return wrap(dispatch_nonzero_numpy(r.tensor(0))); - } else { - return wrap(dispatch_nonzero(r.tensor(0))); - } + const auto as_tuple = r.toBool(1); + const auto has_out = !r.isNone(2); + + if (as_tuple) { + TORCH_CHECK(!has_out, "nonzero does not support the out kwarg when as_tuple is True"); + return wrap(dispatch_nonzero_numpy(r.tensor(0))); + } + + if (has_out) { + return wrap(dispatch_nonzero(r.tensor(0), r.tensor(2))); } + + return wrap(dispatch_nonzero(r.tensor(0))); + END_HANDLE_TH_ERRORS } diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 3cc3585aa555..96301611c2e5 100644 --- 
a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -91,11 +91,7 @@ core_sources_common = [ "torch/csrc/jit/serialization/unpickler.cpp", ] -jit_sources_common = [ - "torch/csrc/jit/runtime/register_prim_ops_c10.cpp", -] - -libtorch_sources_common = core_sources_common + jit_sources_common +libtorch_sources_common = core_sources_common core_trainer_sources = [ "torch/csrc/autograd/anomaly_mode.cpp", @@ -152,6 +148,7 @@ core_sources_full = [ "torch/csrc/jit/ir/scope.cpp", "torch/csrc/jit/ir/subgraph_matcher.cpp", "torch/csrc/jit/jit_log.cpp", + "torch/csrc/jit/passes/annotate_warns.cpp", "torch/csrc/jit/passes/bailout_graph.cpp", "torch/csrc/jit/passes/batch_mm.cpp", "torch/csrc/jit/passes/canonicalize.cpp", @@ -223,6 +220,7 @@ core_sources_full = [ "torch/csrc/jit/runtime/profiling_record.cpp", "torch/csrc/jit/runtime/symbolic_script.cpp", "torch/csrc/jit/runtime/static/impl.cpp", + "torch/csrc/jit/runtime/static/ops.cpp", "torch/csrc/jit/serialization/import.cpp", "torch/csrc/jit/serialization/import_export_helpers.cpp", "torch/csrc/jit/serialization/import_source.cpp", @@ -233,7 +231,6 @@ core_sources_full = [ "torch/csrc/jit/tensorexpr/codegen.cpp", "torch/csrc/jit/tensorexpr/eval.cpp", "torch/csrc/jit/tensorexpr/expr.cpp", - "torch/csrc/jit/tensorexpr/function.cpp", "torch/csrc/jit/tensorexpr/hash_provider.cpp", "torch/csrc/jit/tensorexpr/ir.cpp", "torch/csrc/jit/tensorexpr/ir_mutator.cpp", @@ -302,12 +299,11 @@ jit_sources_full = [ "torch/csrc/jit/runtime/register_prim_ops.cpp", "torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp", "torch/csrc/jit/runtime/register_special_ops.cpp", - "torch/csrc/jit/runtime/register_string_ops.cpp", "torch/csrc/jit/passes/remove_inplace_ops.cpp", "torch/csrc/jit/passes/utils/check_alias_annotation.cpp", ] -libtorch_core_jit_sources = sorted(jit_sources_common + jit_sources_full) +libtorch_core_jit_sources = sorted(jit_sources_full) libtorch_cmake_sources = libtorch_core_sources + libtorch_core_jit_sources @@ -344,6 +340,7 @@ libtorch_cuda_sources = [ "torch/csrc/autograd/functions/comm.cpp", "torch/csrc/jit/codegen/cuda/arith.cpp", "torch/csrc/jit/codegen/cuda/compute_at.cpp", + "torch/csrc/jit/codegen/cuda/codegen.cpp", "torch/csrc/jit/codegen/cuda/dispatch.cpp", "torch/csrc/jit/codegen/cuda/expr_evaluator.cpp", "torch/csrc/jit/codegen/cuda/executor.cpp", @@ -353,6 +350,7 @@ libtorch_cuda_sources = [ "torch/csrc/jit/codegen/cuda/fusion.cpp", "torch/csrc/jit/codegen/cuda/graph_fuser.cpp", "torch/csrc/jit/codegen/cuda/index_compute.cpp", + "torch/csrc/jit/codegen/cuda/instrumentation.cpp", "torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp", "torch/csrc/jit/codegen/cuda/ir_cloner.cpp", "torch/csrc/jit/codegen/cuda/ir_graphviz.cpp", @@ -362,8 +360,10 @@ libtorch_cuda_sources = [ "torch/csrc/jit/codegen/cuda/kernel.cpp", "torch/csrc/jit/codegen/cuda/kernel_cache.cpp", "torch/csrc/jit/codegen/cuda/kernel_ir.cpp", + "torch/csrc/jit/codegen/cuda/kernel_ir_builder.cpp", "torch/csrc/jit/codegen/cuda/lower_index.cpp", "torch/csrc/jit/codegen/cuda/lower_loops.cpp", + "torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp", "torch/csrc/jit/codegen/cuda/lower_unroll.cpp", "torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp", "torch/csrc/jit/codegen/cuda/lower_utils.cpp", @@ -542,11 +542,14 @@ libtorch_python_core_sources = [ "torch/csrc/utils/disable_torch_function.cpp", ] -libtorch_python_distributed_sources = [ - "torch/csrc/distributed/autograd/init.cpp", +libtorch_python_distributed_core_sources = [ "torch/csrc/distributed/c10d/comm.cpp", 
"torch/csrc/distributed/c10d/init.cpp", "torch/csrc/distributed/c10d/reducer.cpp", +] + +libtorch_python_distributed_sources = libtorch_python_distributed_core_sources + [ + "torch/csrc/distributed/autograd/init.cpp", "torch/csrc/distributed/rpc/init.cpp", "torch/csrc/distributed/rpc/process_group_agent.cpp", "torch/csrc/distributed/rpc/py_rref.cpp", @@ -575,48 +578,4 @@ def glob_libtorch_python_sources(gencode_pattern = ":generate-code[{}]"): _libtorch_python_sources.extend(libtorch_python_core_sources) _libtorch_python_sources.extend(libtorch_python_distributed_sources) - _libtorch_python_sources.extend([ - "test/cpp/jit/torch_python_test.cpp", - "test/cpp/tensorexpr/padded_buffer.cpp", - "test/cpp/jit/test_alias_analysis.cpp", - "test/cpp/jit/test_argument_spec.cpp", - "test/cpp/jit/test_autodiff.cpp", - "test/cpp/jit/test_backend.cpp", - "test/cpp/jit/test_base.cpp", - "test/cpp/jit/test_class_import.cpp", - "test/cpp/jit/test_class_parser.cpp", - "test/cpp/jit/test_class_type.cpp", - "test/cpp/jit/test_code_template.cpp", - "test/cpp/jit/test_constant_pooling.cpp", - "test/cpp/jit/test_cleanup_passes.cpp", - "test/cpp/jit/test_create_autodiff_subgraphs.cpp", - "test/cpp/jit/test_custom_class.cpp", - "test/cpp/jit/test_custom_operators.cpp", - "test/cpp/jit/test_dce.cpp", - "test/cpp/jit/test_fuser.cpp", - "test/cpp/jit/test_gpu.cpp", - "test/cpp/jit/test_graph_executor.cpp", - "test/cpp/jit/test_inliner.cpp", - "test/cpp/jit/test_interface.cpp", - "test/cpp/jit/test_interpreter.cpp", - "test/cpp/jit/test_ir.cpp", - "test/cpp/jit/test_irparser.cpp", - "test/cpp/jit/test_jit_type.cpp", - "test/cpp/jit/test_lite_interpreter.cpp", - "test/cpp/jit/test_lite_trainer.cpp", - "test/cpp/jit/test_misc.cpp", - "test/cpp/jit/test_mobile_type_parser.cpp", - "test/cpp/jit/test_module_api.cpp", - "test/cpp/jit/test_peephole_optimize.cpp", - "test/cpp/jit/test_qualified_name.cpp", - "test/cpp/jit/test_save_load.cpp", - "test/cpp/jit/test_schema_matching.cpp", - "test/cpp/jit/test_subgraph_matcher.cpp", - "test/cpp/jit/test_subgraph_rewriter.cpp", - "test/cpp/jit/test_subgraph_utils.cpp", - "test/cpp/jit/test_utils.cpp", - ]) - - _libtorch_python_sources.extend(native.glob(["test/cpp/tensorexpr/test_*.cpp"])) - return _libtorch_python_sources diff --git a/tools/clang_format_all.py b/tools/clang_format_all.py index 710a21e33514..77ca68d92b0b 100755 --- a/tools/clang_format_all.py +++ b/tools/clang_format_all.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ -A script that runs clang-format on all C/C++ files in CLANG_FORMAT_WHITELIST. There is +A script that runs clang-format on all C/C++ files in CLANG_FORMAT_ALLOWLIST. There is also a diff mode which simply checks if clang-format would make any changes, which is useful for CI purposes. @@ -14,22 +14,22 @@ import sys from clang_format_utils import get_and_check_clang_format, CLANG_FORMAT_PATH -# Whitelist of directories to check. All files that in that directory +# Allowlist of directories to check. All files that in that directory # (recursively) will be checked. -# If you edit this, please edit the whitelist in clang_format_ci.sh as well. -CLANG_FORMAT_WHITELIST = ["torch/csrc/jit/", "test/cpp/jit/", "test/cpp/tensorexpr/"] +# If you edit this, please edit the allowlist in clang_format_ci.sh as well. +CLANG_FORMAT_ALLOWLIST = ["torch/csrc/jit/", "test/cpp/jit/", "test/cpp/tensorexpr/"] # Only files with names matching this regex will be formatted. 
CPP_FILE_REGEX = re.compile(".*\\.(h|cpp|cc|c|hpp)$") -def get_whitelisted_files(): +def get_allowlisted_files(): """ - Parse CLANG_FORMAT_WHITELIST and resolve all directories. - Returns the set of whitelist cpp source files. + Parse CLANG_FORMAT_ALLOWLIST and resolve all directories. + Returns the set of allowlist cpp source files. """ matches = [] - for dir in CLANG_FORMAT_WHITELIST: + for dir in CLANG_FORMAT_ALLOWLIST: for root, dirnames, filenames in os.walk(dir): for filename in filenames: if CPP_FILE_REGEX.match(filename): @@ -77,7 +77,7 @@ async def file_clang_formatted_correctly(filename, semaphore, verbose=False): async def run_clang_format(max_processes, diff=False, verbose=False): """ - Run clang-format to all files in CLANG_FORMAT_WHITELIST that match CPP_FILE_REGEX. + Run clang-format to all files in CLANG_FORMAT_ALLOWLIST that match CPP_FILE_REGEX. """ # Check to make sure the clang-format binary exists. if not os.path.exists(CLANG_FORMAT_PATH): @@ -97,7 +97,7 @@ async def run_clang_format(max_processes, diff=False, verbose=False): # Format files in parallel. if diff: - for f in asyncio.as_completed([file_clang_formatted_correctly(f, semaphore, verbose) for f in get_whitelisted_files()]): + for f in asyncio.as_completed([file_clang_formatted_correctly(f, semaphore, verbose) for f in get_allowlisted_files()]): ok &= await f if ok: @@ -105,7 +105,7 @@ async def run_clang_format(max_processes, diff=False, verbose=False): else: print("Some files not formatted correctly") else: - await asyncio.gather(*[run_clang_format_on_file(f, semaphore, verbose) for f in get_whitelisted_files()]) + await asyncio.gather(*[run_clang_format_on_file(f, semaphore, verbose) for f in get_allowlisted_files()]) return ok @@ -134,7 +134,7 @@ def main(args): options = parse_args(args) # Get clang-format and make sure it is the right binary and it is in the right place. ok = get_and_check_clang_format(options.verbose) - # Invoke clang-format on all files in the directories in the whitelist. + # Invoke clang-format on all files in the directories in the allowlist. if ok: loop = asyncio.get_event_loop() ok = loop.run_until_complete(run_clang_format(options.max_processes, options.diff, options.verbose)) diff --git a/tools/clang_tidy.py b/tools/clang_tidy.py index 354aedc601ad..f8e8e61857e5 100755 --- a/tools/clang_tidy.py +++ b/tools/clang_tidy.py @@ -12,7 +12,7 @@ glob or regular expressions. """ -from __future__ import print_function + import argparse import collections diff --git a/tools/code_analyzer/gen_op_registration_whitelist.py b/tools/code_analyzer/gen_op_registration_allowlist.py similarity index 94% rename from tools/code_analyzer/gen_op_registration_whitelist.py rename to tools/code_analyzer/gen_op_registration_allowlist.py index 5971864b2187..56e0f78cc1b5 100644 --- a/tools/code_analyzer/gen_op_registration_whitelist.py +++ b/tools/code_analyzer/gen_op_registration_allowlist.py @@ -1,11 +1,11 @@ """ -This util is invoked from cmake to produce the op registration whitelist param +This util is invoked from cmake to produce the op registration allowlist param for `ATen/gen.py` for custom mobile build. For custom build with dynamic dispatch, it takes the op dependency graph of ATen and the list of root ops, and outputs all transitive dependencies of the root -ops as the whitelist. +ops as the allowlist. For custom build with static dispatch, the op dependency graph will be omitted, -and it will directly output root ops as the whitelist. +and it will directly output root ops as the allowlist. 
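# A rough sketch of the "transitive dependencies of the root ops" computation described
# in the docstring above, assuming the dependency graph is a plain dict from an op name
# to the ops it may call. The real tool parses a YAML graph and handles base/overload
# fan-out, which this sketch does not attempt.
from collections import deque

def transitive_closure(dep_graph, root_ops):
    allowlist = set(root_ops)
    queue = deque(root_ops)
    while queue:
        op = queue.popleft()
        for dep in dep_graph.get(op, ()):
            if dep not in allowlist:
                allowlist.add(dep)
                queue.append(dep)
    return sorted(allowlist)

# Example: a root op transitively pulls in everything it may call.
graph = {"aten::add": ["aten::empty_like"], "aten::empty_like": ["aten::copy_"]}
assert transitive_closure(graph, ["aten::add"]) == ["aten::add", "aten::copy_", "aten::empty_like"]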
""" import argparse diff --git a/tools/codegen/api/cpp.py b/tools/codegen/api/cpp.py index 452c3721ab92..538ba3596c7d 100644 --- a/tools/codegen/api/cpp.py +++ b/tools/codegen/api/cpp.py @@ -1,7 +1,9 @@ from tools.codegen.model import * from tools.codegen.api.types import TensorOptionsArguments, CppArgument, ThisArgument import tools.codegen.local as local -from typing import Optional, Sequence, Union, Callable, List +from typing import Optional, Sequence, Union, Callable, List, Tuple +import copy +from dataclasses import dataclass # This file describes the translation of JIT schema to the public C++ # API, which is what people use when they call functions like at::add. @@ -71,9 +73,6 @@ def argumenttype_type(t: Type, *, mutable: bool) -> str: if r is not None: return r - if str(t) == 'Tensor' and mutable and local.hack_const_mutable_self(): - return 'const Tensor &' - if isinstance(t, BaseType): if t.name == BaseTy.Tensor: if mutable: @@ -155,6 +154,7 @@ def returns_type(rs: Sequence[Return]) -> str: '[]': '{}', '[0,1]': '{0,1}', # TODO: stop special casing 'contiguous_format': 'MemoryFormat::Contiguous', + 'long': 'at::kLong', } # Convert a JIT default into C++ expression representing the default @@ -194,9 +194,50 @@ def argument(a: Union[Argument, TensorOptionsArguments, ThisArgument]) -> CppArg else: assert_never(a) -def group_arguments( - func: FunctionSchema, *, method: bool = False -) -> Sequence[Union[Argument, TensorOptionsArguments, ThisArgument]]: +@dataclass(frozen=True) +class CppSignature: + returns: Tuple[Return, ...] + arguments: Tuple[Union[Argument, TensorOptionsArguments, ThisArgument], ...] + + def cpp_arguments(self) -> Sequence[CppArgument]: + return list(map(argument, self.arguments)) + + # Return arguments as a comma separated list, i.e. like they would be in a C++ + # function signature. Include default values for arguments. + def cpp_arguments_str(self, with_defaults: bool) -> str: + args_without_this = [argument(a) for a in self.arguments if not isinstance(a, ThisArgument)] + if with_defaults: + return ', '.join(map(str, args_without_this)) + else: + return ', '.join(map(lambda s: s.str_no_default(), args_without_this)) + + +@dataclass(frozen=True) +class CppSignatureGroup: + # arguments contains the arguments for the C++ signature as it is represented + # in the JIT schema. + signature: CppSignature + + # gathered_signature is an alternative C++ signature in which TensorOptions are + # gathered into one TensorOptions object instead of being scattered into + # ScalarType, Layout, Device. This is only present for factory operators, + # other operators have this set to None. This can be used to generate a + # convenience API in the C++ frontend so users can call using TensorOptions objects. + gathered_signature: Optional[CppSignature] + + # If it is a factory op, this returns the arguments for the convenience API + # that takes TensorOptions. If it is not a factory op and doesn't have + # a gathered signature, then this returns the regular signature instead. 
+ def signature_prefer_gathered(self) -> CppSignature: + if self.gathered_signature is not None: + return self.gathered_signature + else: + return self.signature + + +def signature_group( + func: FunctionSchema, *, method: bool = False, +) -> CppSignatureGroup: args: List[Union[Argument, ThisArgument, TensorOptionsArguments]] = [] args.extend(func.out_arguments) @@ -205,8 +246,9 @@ def group_arguments( else: args.extend(func.arguments) - # group up arguments for tensor options + gathered_args = copy.deepcopy(args) + # group up arguments for tensor options def pred(name: str, ty: Type) -> Callable[[Argument], bool]: return lambda a: a.name == name and a.type in [ty, OptionalType(ty)] predicates = [ # order matters @@ -216,14 +258,16 @@ def pred(name: str, ty: Type) -> Callable[[Argument], bool]: pred('pin_memory', Type.parse('bool')), ] + has_tensoroptions_argument = False i = 0 while i < len(func.kwarg_only_arguments): # If there is enough space... if i <= len(func.kwarg_only_arguments) - len(predicates): # And the next len(predicates) arguments look like TensorOptions arguments if all(p(a) for p, a in zip(predicates, func.kwarg_only_arguments[i : i + len(predicates)])): + has_tensoroptions_argument = True # Group them together as one argument - args.append(TensorOptionsArguments( + gathered_args.append(TensorOptionsArguments( dtype=func.kwarg_only_arguments[i], layout=func.kwarg_only_arguments[i + 1], device=func.kwarg_only_arguments[i + 2], @@ -231,11 +275,19 @@ def pred(name: str, ty: Type) -> Callable[[Argument], bool]: )) i += len(predicates) continue - args.append(func.kwarg_only_arguments[i]) + gathered_args.append(func.kwarg_only_arguments[i]) i += 1 - return args + args.extend(func.kwarg_only_arguments) -# Convert arguments to C++ API form -def arguments(func: FunctionSchema, *, method: bool = False) -> Sequence[CppArgument]: - return list(map(argument, group_arguments(func, method=method))) + if has_tensoroptions_argument: + return CppSignatureGroup( + signature=CppSignature(arguments=tuple(args), returns=tuple(func.returns)), + gathered_signature=CppSignature(arguments=tuple(gathered_args), returns=tuple(func.returns)), + ) + else: + assert gathered_args == args + return CppSignatureGroup( + signature=CppSignature(arguments=tuple(args), returns=tuple(func.returns)), + gathered_signature=None, + ) diff --git a/tools/codegen/api/dispatcher.py b/tools/codegen/api/dispatcher.py index 34960534275f..6cb141c22f99 100644 --- a/tools/codegen/api/dispatcher.py +++ b/tools/codegen/api/dispatcher.py @@ -2,10 +2,10 @@ from tools.codegen.api.types import CppArgument, DispatcherExpr, TensorOptionsArguments, \ DispatcherArgument, ThisArgument, LegacyDispatcherArgument -import tools.codegen.api.cpp as cpp +from tools.codegen.api import cpp import tools.codegen.api.legacy_dispatcher as legacy_dispatcher import tools.codegen.local as local - +from enum import Enum import itertools from typing import Sequence, Optional @@ -63,6 +63,9 @@ def argument(a: Argument) -> DispatcherArgument: argument=la.argument, ) +def name(func: FunctionSchema) -> str: + return cpp.name(func) + def arguments(func: FunctionSchema) -> Sequence[DispatcherArgument]: if local.use_c10_dispatcher() is UseC10Dispatcher.full: return list(map(argument, itertools.chain(func.out_arguments, func.arguments, func.kwarg_only_arguments))) @@ -72,11 +75,19 @@ def arguments(func: FunctionSchema) -> Sequence[DispatcherArgument]: for la in legacy_dispatcher.arguments(func) ] +# TODO GATHER is only needed for non-c10-full ops, remove later. 
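# A compact sketch of what the three ProcessTensoroptions modes introduced right below
# mean for a single TensorOptions argument named "options". The returned strings mirror
# the C++ expressions emitted by cppargument_exprs; the helper itself is only
# illustrative and not part of the codegen.
def tensoroptions_exprs(mode, name="options"):
    if mode == "SCATTER":
        # c10-full ops take dtype/layout/device/pin_memory separately, so the
        # TensorOptions object in scope is taken apart.
        return [
            f"optTypeMetaToScalarType({name}.dtype_opt())",
            f"{name}.layout_opt()",
            f"{name}.device_opt()",
            f"{name}.pinned_memory_opt()",
        ]
    if mode == "GATHER":
        # the caller has dtype/layout/device/pin_memory in scope and the callee wants a
        # single TensorOptions object, so they are gathered back together.
        return ["TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory)"]
    # PASS_THROUGH: hand the TensorOptions object over unchanged.
    return [name]

assert tensoroptions_exprs("PASS_THROUGH") == ["options"]
assert len(tensoroptions_exprs("SCATTER")) == 4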
+ProcessTensoroptions = Enum('ProcessTensoroptions', ('GATHER', 'SCATTER', 'PASS_THROUGH')) + + # Given a set of CppArguments in scope, return a sequence of dispatcher # expressions that translate the cpp API into dispatcher API -def cppargument_exprs(a: CppArgument, *, tensor_options: Optional[CppArgument]) -> Sequence[DispatcherExpr]: +def cppargument_exprs(a: CppArgument, + *, + tensor_options: Optional[CppArgument], + process_tensoroptions: ProcessTensoroptions = ProcessTensoroptions.PASS_THROUGH + ) -> Sequence[DispatcherExpr]: if isinstance(a.argument, TensorOptionsArguments): - if local.use_c10_dispatcher() is UseC10Dispatcher.full: + if process_tensoroptions == ProcessTensoroptions.SCATTER: ta = a.argument return [ DispatcherExpr(type=argument_type(ta.dtype), expr=f'optTypeMetaToScalarType({a.name}.dtype_opt())'), @@ -84,8 +95,16 @@ def cppargument_exprs(a: CppArgument, *, tensor_options: Optional[CppArgument]) DispatcherExpr(type=argument_type(ta.device), expr=f'{a.name}.device_opt()'), DispatcherExpr(type=argument_type(ta.pin_memory), expr=f'{a.name}.pinned_memory_opt()'), # weird discrep ] + elif process_tensoroptions == ProcessTensoroptions.GATHER: + return [ + DispatcherExpr( + type='const TensorOptions &', + expr="TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory)")] else: + assert process_tensoroptions == ProcessTensoroptions.PASS_THROUGH return [DispatcherExpr(type='const TensorOptions &', expr=a.name)] + elif isinstance(a.argument, ThisArgument): + return [DispatcherExpr(type=argument_type(a.argument.argument), expr=a.name)] elif isinstance(a.argument, Argument): if a.name == 'memory_format' and tensor_options is not None and local.use_c10_dispatcher() is UseC10Dispatcher.full: return [DispatcherExpr( @@ -94,16 +113,35 @@ def cppargument_exprs(a: CppArgument, *, tensor_options: Optional[CppArgument]) ] else: return [DispatcherExpr(type=argument_type(a.argument), expr=a.name)] - elif isinstance(a.argument, ThisArgument): - return [DispatcherExpr(type=argument_type(a.argument.argument), expr=a.name)] else: assert_never(a.argument) -def cpparguments_exprs(args: Sequence[CppArgument]) -> Sequence[DispatcherExpr]: +def cpparguments_exprs(args: Sequence[CppArgument], process_tensoroptions: ProcessTensoroptions) -> Sequence[DispatcherExpr]: tensor_options = next((a for a in args if isinstance(a.argument, TensorOptionsArguments)), None) - return [r for a in args for r in cppargument_exprs(a, tensor_options=tensor_options)] + return [r for a in args for r in cppargument_exprs(a, + tensor_options=tensor_options, + process_tensoroptions=process_tensoroptions)] # I don't think this is entirely sound, but it should be reasonably # close def legacydispatcherarguments_exprs(args: Sequence[LegacyDispatcherArgument]) -> Sequence[DispatcherExpr]: - return cpparguments_exprs([CppArgument(type=a.type, name=a.name, default=None, argument=a.argument) for a in args]) + if local.use_c10_dispatcher() is UseC10Dispatcher.full: + process_tensoroptions = ProcessTensoroptions.SCATTER + else: + process_tensoroptions = ProcessTensoroptions.PASS_THROUGH + return cpparguments_exprs([CppArgument(type=a.type, + name=a.name, + default=None, + argument=a.argument) for a in args], + process_tensoroptions=process_tensoroptions) + +def exprs(args: Sequence[DispatcherArgument]) -> Sequence[DispatcherExpr]: + if local.use_c10_dispatcher() is UseC10Dispatcher.full: + process_tensoroptions = ProcessTensoroptions.SCATTER + else: + process_tensoroptions = 
ProcessTensoroptions.PASS_THROUGH + return cpparguments_exprs([CppArgument(type=a.type, + name=a.name, + default=None, + argument=a.argument) for a in args], + process_tensoroptions=process_tensoroptions) diff --git a/tools/codegen/api/legacy_dispatcher.py b/tools/codegen/api/legacy_dispatcher.py index db3d26c84fd0..160d39495951 100644 --- a/tools/codegen/api/legacy_dispatcher.py +++ b/tools/codegen/api/legacy_dispatcher.py @@ -71,4 +71,6 @@ def argument(a: Union[Argument, ThisArgument, TensorOptionsArguments]) -> Legacy assert_never(a) def arguments(func: FunctionSchema) -> Sequence[LegacyDispatcherArgument]: - return list(map(argument, cpp.group_arguments(func))) + signature_group = cpp.signature_group(func) + args = signature_group.signature_prefer_gathered().arguments + return list(map(argument, args)) diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index e4acb369f08e..48a2b3f56702 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -2,7 +2,7 @@ import contextlib import textwrap import itertools -from typing import List, Dict, Optional, Iterator, Tuple, Set, Callable, Any, TypeVar, DefaultDict, Union, Sequence +from typing import List, Dict, Optional, Iterator, Tuple, Set, Callable, Any, TypeVar, Union, Sequence import yaml from enum import Enum from collections import OrderedDict @@ -14,6 +14,7 @@ from tools.codegen.model import * from tools.codegen.api.types import * import tools.codegen.api.cpp as cpp +from tools.codegen.api.cpp import CppSignature import tools.codegen.api.dispatcher as dispatcher import tools.codegen.api.legacy_dispatcher as legacy_dispatcher import tools.codegen.local as local @@ -46,14 +47,6 @@ # the dispatcher API, and the legacy disaptcher API. See each # of these respective files for more information - -# Note [Byte-for-byte compatibility] -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Some special cases we have made in this codegen have been strictly -# to make sure that git diff -w reports no changes, but we believe -# they are not semantically meaningful. 
After landing the new codegen, -# we should remove these special cases - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # # HELPER FUNCTIONS @@ -111,8 +104,6 @@ def wrapper(f: NativeFunction) -> T: with context(f'in {f.loc}:\n {f.func}'): with local.parametrize( use_c10_dispatcher=f.use_c10_dispatcher, - # See Note [Byte-for-byte compatibility] - hack_const_mutable_self=str(f.func.name) in ["set_data", "retain_grad"], ): return func(f) return wrapper @@ -224,11 +215,7 @@ def func(f: NativeFunction) -> Optional[str]: args_exprs_str = ', '.join(map(lambda a: a.name, args)) - # See Note [Byte-for-byte compatibility] - # (return void_func() is valid C++) return_kw = " return " - if returns_type == "void": - return_kw = " " cuda_guard = "" if dispatch is None or 'CUDA' in dispatch or 'Vulkan' == dispatch: @@ -241,14 +228,6 @@ def func(f: NativeFunction) -> Optional[str]: # Only tensor like arguments are eligible device_of = next((f'{a.name}' for a in candidate_args if a.type.is_tensor_like()), None) - # See Note [Byte-for-byte compatibility] - # I wasn't able to figure out the internal logic for - # these device guards - if str(f.func.name) == "_thnn_fused_lstm_cell_backward": - device_of = "cx" - elif str(f.func.name) == "_thnn_differentiable_lstm_cell_backward": - device_of = "input_gates" - has_tensor_options = any(isinstance(a.argument, TensorOptionsArguments) for a in args) # TODO: There is probably a simpler version of this that @@ -257,9 +236,6 @@ def func(f: NativeFunction) -> Optional[str]: cuda_guard = """\ const DeviceGuard device_guard(options.device()); """ - # See Note [Byte-for-byte compatibility] - if dispatch is not None: - cuda_guard = f"\n{cuda_guard}" elif f.device_guard and dispatch is not None and 'CUDA' in dispatch and has_tensor_options: cuda_guard = """\ globalContext().lazyInitCUDA(); @@ -269,16 +245,10 @@ def func(f: NativeFunction) -> Optional[str]: cuda_guard = f"""\ const OptionalDeviceGuard device_guard(device_of({device_of})); """ - # See Note [Byte-for-byte compatibility] - if dispatch is not None: - cuda_guard = f"\n{cuda_guard}" else: cuda_guard = """\ // DeviceGuard omitted """ - # See Note [Byte-for-byte compatibility] - if dispatch is not None: - cuda_guard = f"\n{cuda_guard}" return f"""\ {returns_type} {name}({args_str}) {{ @@ -290,7 +260,7 @@ def func(f: NativeFunction) -> Optional[str]: assert returns_type == dispatcher.returns_type(f.func.returns) dispatcher_args = dispatcher.arguments(f.func) dispatcher_args_types_str = ', '.join(map(lambda a: a.type, dispatcher_args)) - if dispatch is None: + if dispatch is None or dispatch == 'Math': type_name = f'TypeDefault::{name}' else: type_name = f'{dispatch}Type::{name}' @@ -304,14 +274,9 @@ def func(f: NativeFunction) -> Optional[str]: if not def_only and not f.manual_kernel_registration and (dispatch is not None or f.dispatch is None): # Figure out which signature the function is if local.use_c10_dispatcher() is UseC10Dispatcher.full: - # See Note [Byte-for-byte compatibility] - if dispatch is not None: - nl = "\n" - else: - nl = "" payload = "c10::impl::hacky_wrapper_for_legacy_signatures<" \ - f"{returns_type} ({dispatcher_args_types_str})>({nl}TORCH_FN({type_name}))" + f"{returns_type} ({dispatcher_args_types_str})>(TORCH_FN({type_name}))" else: payload = f"torch::CppFunction::makeUnboxedOnly(&{type_name})" @@ -336,6 +301,28 @@ def func(f: NativeFunction) -> Optional[str]: return func +# Return a string with a comma separated list of expressions that could be used +# to call this 
operator. This can be used to generate code that wraps operators +# and calls back into them. The process_tensoroptions argument determines how +# tensor options should be treated. They can be +# - PASS_THROUGH: Don't do anything, just handle them as regular arguments +# - SCATTER: Expect a `TensorOptions options` in the scope and scatter it into `options.dtype, ...` +# - GATHER: Expect `dtype, ...` in the scope and gather them into a TensorOptions for calling +def exprs_str(signature: CppSignature, + process_tensoroptions: dispatcher.ProcessTensoroptions = dispatcher.ProcessTensoroptions.PASS_THROUGH, + exclude_this: bool = False, + ) -> str: + args = signature.cpp_arguments() + if exclude_this: + args = [a for a in args if not isinstance(a.argument, ThisArgument)] + exprs = dispatcher.cpparguments_exprs(args, process_tensoroptions=process_tensoroptions) + return ', '.join(map(lambda a: a.expr, exprs)) + +def types_str(signature: CppSignature) -> str: + args = signature.cpp_arguments() + exprs = dispatcher.cpparguments_exprs(args, process_tensoroptions=dispatcher.ProcessTensoroptions.PASS_THROUGH) + return ', '.join(map(lambda a: a.type, exprs)) + # Generates Function.cpp and Function.h. These files provide the # functional public C++ API, and the scaffolding to call into # the dispatcher from these functions. See also compute_tensor_method. @@ -347,32 +334,73 @@ def go(f: NativeFunction) -> Optional[str]: if Variant.function not in f.variants: return None - name = cpp.name(f.func) - cpp_returns_type = cpp.returns_type(f.func.returns) - cpp_args = cpp.arguments(f.func) - cpp_args_str = ', '.join(map(str, cpp_args)) + cpp_name = cpp.name(f.func) + signature_group = cpp.signature_group(f.func, method=False) if target is Target.DECLARATION: - return f"CAFFE2_API {cpp_returns_type} {name}({cpp_args_str});" + if signature_group.gathered_signature is None: + # There's no TensorOptions + return f""" +CAFFE2_API {cpp_returns_type} {cpp_name}({signature_group.signature.cpp_arguments_str(with_defaults=True)}); +""" + else: + # There's TensorOptions in the API. Create 2 APIs - one taking the TensorOptions object ("gathered_signature"), + # and one taking a scattered signature with ScalarType, Layout, Device separately ("signature"). + # The gathered_signature already exists in several older PyTorch versions and had default arguments. + # For backward compatibility, we left it unchanged and added the scattered API on top of it. + # Note that the scattered API cannot have default arguments or calls will be ambigious. 
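# A small decision sketch of which generated C++ overload talks to the dispatcher
# directly and which one is a thin forwarding proxy, summarizing the declaration and
# definition branches around this point. "scattered" and "gathered" refer to the
# CppSignatureGroup members; use_c10_full stands in for
# `local.use_c10_dispatcher() is UseC10Dispatcher.full`. This helper is illustrative only.
def real_and_proxy(has_tensor_options, use_c10_full):
    if not has_tensor_options:
        return ("signature", None)           # only one overload is generated at all
    if use_c10_full:
        return ("scattered", "gathered")     # proxy scatters TensorOptions, then forwards
    return ("gathered", "scattered")         # proxy gathers dtype/layout/device, then forwards

assert real_and_proxy(False, True) == ("signature", None)
assert real_and_proxy(True, True) == ("scattered", "gathered")
assert real_and_proxy(True, False) == ("gathered", "scattered")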
+ return f""" +CAFFE2_API {cpp_returns_type} {cpp_name}({signature_group.gathered_signature.cpp_arguments_str(with_defaults=True)}); +CAFFE2_API {cpp_returns_type} {cpp_name}({signature_group.signature.cpp_arguments_str(with_defaults=False)}); +""" assert target is Target.DEFINITION - dispatcher_exprs = dispatcher.cpparguments_exprs(cpp_args) - cpp_args_str_no_default = ', '.join(map(lambda a: a.str_no_default(), cpp_args)) dispatcher_returns_type = dispatcher.returns_type(f.func.returns) - dispatcher_types_str = ', '.join(map(lambda a: a.type, dispatcher_exprs)) - dispatcher_exprs_str = ', '.join(map(lambda a: a.expr, dispatcher_exprs)) - return f""" + if signature_group.gathered_signature is None: + # There's no TensorOptions + return f""" +// aten::{f.func} +{cpp_returns_type} {cpp_name}({signature_group.signature.cpp_arguments_str(with_defaults=False)}) {{ + static auto op = c10::Dispatcher::singleton() + .findSchemaOrThrow("aten::{f.func.name.name}", "{f.func.name.overload_name}") + .typed<{dispatcher_returns_type} ({types_str(signature_group.signature)})>(); + return op.call({exprs_str(signature_group.signature)}); +}} +""" + elif local.use_c10_dispatcher() is UseC10Dispatcher.full: + # for c10-full ops, the scattered version is the real op and the gathered version is a proxy + # calling into the scattered version + return f""" +// aten::{f.func} +{cpp_returns_type} {cpp_name}({signature_group.signature.cpp_arguments_str(with_defaults=False)}) {{ + static auto op = c10::Dispatcher::singleton() + .findSchemaOrThrow("aten::{f.func.name.name}", "{f.func.name.overload_name}") + .typed<{dispatcher_returns_type} ({types_str(signature_group.signature)})>(); + return op.call({exprs_str(signature_group.signature)}); +}} +{cpp_returns_type} {cpp_name}({signature_group.gathered_signature.cpp_arguments_str(with_defaults=False)}) {{ + return {cpp_name}({exprs_str(signature_group.gathered_signature, dispatcher.ProcessTensoroptions.SCATTER)}); +}} +""" + else: + # for non-c10-full ops, the gathered version is the real op and the scattered version is a proxy + # calling into the gathered version + return f""" // aten::{f.func} -{cpp_returns_type} {name}({cpp_args_str_no_default}) {{ +{cpp_returns_type} {cpp_name}({signature_group.gathered_signature.cpp_arguments_str(with_defaults=False)}) {{ static auto op = c10::Dispatcher::singleton() .findSchemaOrThrow("aten::{f.func.name.name}", "{f.func.name.overload_name}") - .typed<{dispatcher_returns_type} ({dispatcher_types_str})>(); - return op.call({dispatcher_exprs_str}); + .typed<{dispatcher_returns_type} ({types_str(signature_group.gathered_signature)})>(); + return op.call({exprs_str(signature_group.gathered_signature)}); +}} +{cpp_returns_type} {cpp_name}({signature_group.signature.cpp_arguments_str(with_defaults=False)}) {{ + return {cpp_name}({exprs_str(signature_group.gathered_signature, dispatcher.ProcessTensoroptions.GATHER)}); }} """ + return go # Generates TensorBody.h (sic) and TensorMethods.cpp. 
These files provide the @@ -388,30 +416,78 @@ def go(f: NativeFunction) -> Optional[str]: assert len(f.func.arguments) > 0 assert sum(a.name == 'self' for a in f.func.arguments) == 1 - name = cpp.name(f.func) + cpp_name = cpp.name(f.func) cpp_returns_type = cpp.returns_type(f.func.returns) - cpp_args = cpp.arguments(f.func, method=True) - cpp_args_exclude_this = [a for a in cpp_args if not isinstance(a.argument, ThisArgument)] - cpp_args_exclude_this_str = ', '.join(str(a) for a in cpp_args_exclude_this) + signature_group = cpp.signature_group(f.func, method=True) if target is Target.DECLARATION: - return f"{cpp_returns_type} {name}({cpp_args_exclude_this_str}) const;" + if signature_group.gathered_signature is None: + # There's no TensorOptions. Just create the API without concern for TensorOptions. + return f"{cpp_returns_type} {cpp_name}({signature_group.signature.cpp_arguments_str(with_defaults=True)}) const;" + else: + # There's TensorOptions in the API. Create 2 APIs - one taking the TensorOptions object ("gathered_signature"), + # and one taking a scattered signature with ScalarType, Layout, Device separately ("signature"). + # The gathered_signature already exists in several older PyTorch versions and had default arguments. + # For backward compatibility, we left it unchanged and added the scattered API on top of it. + # Note that the scattered API cannot have default arguments or calls will be ambigious. + return f""" +{cpp_returns_type} {cpp_name}({signature_group.gathered_signature.cpp_arguments_str(with_defaults=True)}) const; +{cpp_returns_type} {cpp_name}({signature_group.signature.cpp_arguments_str(with_defaults=False)}) const; +""" assert target is Target.DEFINITION - dispatcher_exprs = dispatcher.cpparguments_exprs(cpp_args) - cpp_args_exclude_this_str_no_default = ', '.join(a.str_no_default() for a in cpp_args_exclude_this) dispatcher_returns_type = dispatcher.returns_type(f.func.returns) - dispatcher_types_str = ', '.join(map(lambda a: a.type, dispatcher_exprs)) - dispatcher_exprs_str = ', '.join(map(lambda a: a.expr, dispatcher_exprs)) - return f""" + result = f""" // aten::{f.func} -{cpp_returns_type} Tensor::{name}({cpp_args_exclude_this_str_no_default}) const {{ +{cpp_returns_type} Tensor::{cpp_name}({signature_group.signature.cpp_arguments_str(with_defaults=False)}) const {{ static auto op = c10::Dispatcher::singleton() .findSchemaOrThrow("aten::{f.func.name.name}", "{f.func.name.overload_name}") - .typed<{dispatcher_returns_type} ({dispatcher_types_str})>(); - return op.call({dispatcher_exprs_str}); + .typed<{dispatcher_returns_type} ({types_str(signature_group.signature)})>(); + return op.call({exprs_str(signature_group.signature)}); +}} +""" + + if signature_group.gathered_signature is None: + # There's no TensorOptions + return f""" +// aten::{f.func} +{cpp_returns_type} Tensor::{cpp_name}({signature_group.signature.cpp_arguments_str(with_defaults=False)}) const {{ + static auto op = c10::Dispatcher::singleton() + .findSchemaOrThrow("aten::{f.func.name.name}", "{f.func.name.overload_name}") + .typed<{dispatcher_returns_type} ({types_str(signature_group.signature)})>(); + return op.call({exprs_str(signature_group.signature)}); +}} +""" + elif local.use_c10_dispatcher() is UseC10Dispatcher.full: + # for c10-full ops, the scattered version is the real op and the gathered version is a proxy + # calling into the scattered version + return f""" +// aten::{f.func} +{cpp_returns_type} Tensor::{cpp_name}({signature_group.signature.cpp_arguments_str(with_defaults=False)}) 
const {{ + static auto op = c10::Dispatcher::singleton() + .findSchemaOrThrow("aten::{f.func.name.name}", "{f.func.name.overload_name}") + .typed<{dispatcher_returns_type} ({types_str(signature_group.signature)})>(); + return op.call({exprs_str(signature_group.signature)}); +}} +{cpp_returns_type} Tensor::{cpp_name}({signature_group.gathered_signature.cpp_arguments_str(with_defaults=False)}) const {{ + return {cpp_name}({exprs_str(signature_group.gathered_signature, dispatcher.ProcessTensoroptions.SCATTER, exclude_this=True)}); +}} +""" + else: + # for non-c10-full ops, the gathered version is the real op and the scattered version is a proxy + # calling into the gathered version + return f""" +// aten::{f.func} +{cpp_returns_type} Tensor::{cpp_name}({signature_group.gathered_signature.cpp_arguments_str(with_defaults=False)}) const {{ + static auto op = c10::Dispatcher::singleton() + .findSchemaOrThrow("aten::{f.func.name.name}", "{f.func.name.overload_name}") + .typed<{dispatcher_returns_type} ({types_str(signature_group.gathered_signature)})>(); + return op.call({exprs_str(signature_group.gathered_signature)}); +}} +{cpp_returns_type} Tensor::{cpp_name}({signature_group.signature.cpp_arguments_str(with_defaults=False)}) const {{ + return {cpp_name}({exprs_str(signature_group.gathered_signature, dispatcher.ProcessTensoroptions.GATHER, exclude_this=True)}); }} """ @@ -474,23 +550,35 @@ def go(f: NativeFunction) -> Optional[str]: dispatcher_returns_type = dispatcher.returns_type(f.func.returns) dispatcher_args = dispatcher.arguments(f.func) - dispatcher_exprs = dispatcher.legacydispatcherarguments_exprs(legacy_dispatcher_args) + + args: Union[Sequence[DispatcherArgument], Sequence[LegacyDispatcherArgument]] + if local.use_c10_dispatcher() is UseC10Dispatcher.full: + returns_type = dispatcher_returns_type + args = dispatcher_args + exprs = dispatcher.exprs(dispatcher_args) + dispatch_key = "c10::computeDispatchKey(dtype, layout, device)" + else: + returns_type = legacy_dispatcher_returns_type + args = legacy_dispatcher_args + exprs = dispatcher.legacydispatcherarguments_exprs(legacy_dispatcher_args) + dispatch_key = "options.computeDispatchKey()" if target is Target.DEFINITION: - # See Note [Byte-for-byte compatibility] # I don't think there's actually a good reason to generate # these two cases differently + # The first case could probably be improved though- it calls dispatchTypeId(), + # which looks at TLS dispatch keys- there should not be any by the time we reach backend select. 
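# A toy model of the dispatch-key computation in the backend-select hunk that follows:
# the key implied by the TensorOptions (or scattered dtype/layout/device) is unioned
# with the keys carried by any tensor arguments, keys at or above BackendSelect are
# masked out, and the highest-priority remaining key wins. Key names and their ordering
# here are illustrative, not the real c10 DispatchKey enum.
PRIORITY = ["AutogradCPU", "AutogradCUDA", "BackendSelect", "CUDA", "CPU"]

def compute_backend_key(options_key, tensor_arg_keys):
    dk_set = {options_key} | set(tensor_arg_keys)
    # mask: only keys strictly after BackendSelect are eligible
    eligible = PRIORITY[PRIORITY.index("BackendSelect") + 1:]
    for key in eligible:
        if key in dk_set:
            return key
    raise RuntimeError("no eligible dispatch key")

# A CUDA tensor argument pulls dispatch to CUDA even if the options say CPU.
assert compute_backend_key("CPU", ["CUDA"]) == "CUDA"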
if legacy_dispatcher_tensor_args: tensor_args = ', '.join(a.name for a in legacy_dispatcher_tensor_args) compute_dk = f"""\ -DispatchKeySet _dk_set = DispatchKeySet(options.computeDispatchKey()) | c10::detail::multi_dispatch_key_set({tensor_args}); +DispatchKeySet _dk_set = c10::DispatchKeySet({dispatch_key}) | c10::detail::multi_dispatch_key_set({tensor_args}); DispatchKeySet _dk_mask = c10::DispatchKeySet(DispatchKeySet::FULL_AFTER, DispatchKey::BackendSelect); DispatchKey _dk = c10::impl::dispatchTypeId(_dk_set, _dk_mask);""" else: - compute_dk = "DispatchKey _dk = options.computeDispatchKey();" + compute_dk = f"DispatchKey _dk = {dispatch_key};" return f"""\ // aten::{f.func} -{legacy_dispatcher_returns_type} {name}({', '.join(a.str_with_default() for a in legacy_dispatcher_args)}) {{ +{returns_type} {name}({', '.join(str(a) for a in args)}) {{ static auto op = c10::Dispatcher::singleton() .findSchemaOrThrow("aten::{f.func.name.name}", "{f.func.name.overload_name}") .typed<{dispatcher_returns_type} ({', '.join(a.type for a in dispatcher_args)})>(); @@ -499,7 +587,7 @@ def go(f: NativeFunction) -> Optional[str]: // This trick allows calling Autograd backend kernel first and then backend kernel, // without adding another AutogradBackendSelect dispatch key. DispatchKey _current_dk = at::impl::variable_excluded_from_dispatch() ? _dk : _autograd_dk; - return op.callWithDispatchKey(_current_dk, {', '.join(a.expr for a in dispatcher_exprs)}); + return op.callWithDispatchKey(_current_dk, {', '.join(a.expr for a in exprs)}); }} """ elif target is Target.REGISTRATION: @@ -638,23 +726,8 @@ def compute_returns_yaml(f: NativeFunction) -> Tuple[List[Dict[str, str]], Dict[ name = f.func.out_arguments[i].name # If the return argument is explicitly named... elif r.name: - # See Note [Byte-for-byte compatibility] - # - # Check if it would conflict with an existing argument. - # Downstream codegen assumes that return names and argument - # names don't conflict with each other, so we disambiguate - # (by adding a trailing _return) this case. Notice that - # historically, the collision check was buggy: it just did a - # straight string contains test on the entirety of the - # inputs part of the format string, meaning that it also - # picked up occurrences of the argument name in the NAME of - # the function, as well as substring occurrences of the name - # in arguments. We have simulated the old logic here... - buggy_name_conflict = r.name in str(f.func.name) or \ - any(r.name in a.name for a in f.func.schema_order_arguments()) - # ... 
but a more correct version is simply - # name_conflict = any(r.name == a.name for a in f.func.schema_order_arguments()) - if buggy_name_conflict and not f.func.is_out_fn(): + name_conflict = any(r.name == a.name for a in f.func.schema_order_arguments()) + if name_conflict and not f.func.is_out_fn(): name = f'{r.name}_return' else: name = r.name @@ -715,20 +788,9 @@ def compute_argument_yaml(a: Argument, *, schema_order: bool, kwarg_only_set: Se arg['default'] = pythonify_default(cpp.default_expr(a.default, a.type)) if a.name in kwarg_only_set: arg['kwarg_only'] = True - # See Note [Byte-for-byte compatibility] - # The default value of kwarg_only is False; this case exists for - # byte-for-byte compatibility - elif a.name in out_arg_set: - arg['kwarg_only'] = False if a.name in out_arg_set: arg['output'] = True - # See Note [Byte-for-byte compatibility] - # This is probably a bug in the original implementation, where - # the specification of allocate was not properly propagated to - # the schema-order arguments. In any case, this field - # is redundant with the output field - if not schema_order: - arg['allocate'] = True + arg['allocate'] = True # See Note [name and field_name] if a.name in name_to_field_name: arg['field_name'] = name_to_field_name[a.name] @@ -748,7 +810,8 @@ def compute_declaration_yaml(f: NativeFunction) -> object: kwarg_only_set = set(a.name for a in f.func.kwarg_only_arguments) out_arg_set = set(a.name for a in f.func.out_arguments) - cpp_args = cpp.arguments(f.func) + signature_group = cpp.signature_group(f.func) + cpp_args = signature_group.signature_prefer_gathered().cpp_arguments() arguments = [ compute_cpp_argument_yaml( cpp_a, schema_order=False, @@ -756,9 +819,7 @@ def compute_declaration_yaml(f: NativeFunction) -> object: for cpp_a in cpp_args ] - # See Note [Byte-for-byte compatibility] - # NB: NOT actually schema order. This is almost certainly a BUG. 
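# The ordering fix in the removed/added lines just below, shown on a toy schema: the old
# code chained positional, out, then kwarg-only arguments, while true schema order is
# positional, kwarg-only, then out (as schema_order_arguments() defines it). The argument
# names here are made up for illustration.
positional = ["self", "other"]
kwarg_only = ["alpha"]
out_args = ["out"]

old_order = positional + out_args + kwarg_only      # what the removed chain produced
schema_order = positional + kwarg_only + out_args   # f.func.schema_order_arguments()

assert old_order == ["self", "other", "out", "alpha"]
assert schema_order == ["self", "other", "alpha", "out"]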
- schema_order_jit_arguments = list(itertools.chain(f.func.arguments, f.func.out_arguments, f.func.kwarg_only_arguments)) + schema_order_jit_arguments = list(f.func.schema_order_arguments()) schema_order_arguments = [ compute_argument_yaml( @@ -811,8 +872,20 @@ def compute_declaration_yaml(f: NativeFunction) -> object: ('device_guard', f.device_guard), ('with_gil', False), ('deprecated', False), + ('has_math_kernel', f.dispatch is not None and 'Math' in f.dispatch), ]) +@with_native_function +def compute_registration_declarations(f: NativeFunction) -> str: + name = dispatcher.name(f.func) + returns_type = dispatcher.returns_type(f.func.returns) + args = dispatcher.arguments(f.func) + args_str = ', '.join(map(str, args)) + dispatch = f.dispatch is not None + math = dispatch and 'Math' in f.dispatch # type: ignore + return f"""{returns_type} {name}({args_str}); // {{"schema": "aten::{f.func}", "dispatch": "{dispatch}", "math": "{math}"}} +""" + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # # RUN IT ALL @@ -913,11 +986,6 @@ def main() -> None: nargs='*', help='filter dispatch backend by the whitelist (if set), ' 'e.g.: CPU CUDA QuantizedCPU ...') - parser.add_argument( - '--per_op_registration', - action='store_true', - help='group function registrations by op name and write to separate files; ' - 'must also set --op_registration_whitelist param') parser.add_argument( '--force_schema_registration', action='store_true', @@ -1010,23 +1078,36 @@ def make_file_manager(install_dir: str) -> FileManager: 'function_registrations': list(mapMaybe( compute_type_method( dispatch, target=Target.REGISTRATION, op_registration_whitelist=op_registration_whitelist), - native_functions - )) if not options.per_op_registration else [], + native_functions)), }) del fm cpu_fm.write('TypeDefault.h', lambda: { - 'type_method_declarations': list(mapMaybe( + 'type_method_declarations': + list(mapMaybe( compute_type_method(None, target=Target.DECLARATION, op_registration_whitelist=op_registration_whitelist), + native_functions)) + + list(mapMaybe( + compute_type_method('Math', target=Target.DECLARATION, op_registration_whitelist=op_registration_whitelist), native_functions)), + }) cpu_fm.write('TypeDefault.cpp', lambda: { - 'type_method_definitions': list(mapMaybe( + 'type_method_definitions': + list(mapMaybe( compute_type_method(None, target=Target.DEFINITION, op_registration_whitelist=op_registration_whitelist), + native_functions)) + + list(mapMaybe( + compute_type_method('Math', target=Target.DEFINITION, op_registration_whitelist=op_registration_whitelist), native_functions)), + 'function_registrations': list(mapMaybe( compute_type_method(None, target=Target.REGISTRATION, op_registration_whitelist=op_registration_whitelist), - native_functions)) if not options.per_op_registration else [], + native_functions)), + + 'math_function_registrations': list(mapMaybe( + compute_type_method('Math', target=Target.REGISTRATION, op_registration_whitelist=op_registration_whitelist), + native_functions)), }) cpu_fm.write('Functions.h', lambda: { 'function_declarations': list(mapMaybe(compute_function(target=Target.DECLARATION), native_functions)), @@ -1058,53 +1139,15 @@ def computeSchemaRegister() -> Dict[str, object]: schema_registrations = list(mapMaybe( compute_type_method(None, target=Target.REGISTRATION, op_registration_whitelist=None, def_only=True), native_functions)) - # See Note [Byte-for-byte compatibility] - schema_registrations.sort() return { 'schema_registrations': schema_registrations, } 
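# A worked example of the single-line format compute_registration_declarations (defined
# above) emits into RegistrationDeclarations.h (written out a few lines below): a C++
# declaration followed by a JSON-ish trailer recording the schema and whether the op has
# a dispatch section and a Math kernel. The concrete operator shown is hypothetical.
import json

def registration_declaration(returns_type, name, args_str, schema, has_dispatch, has_math):
    trailer = json.dumps({"schema": schema, "dispatch": str(has_dispatch), "math": str(has_math)})
    return f"{returns_type} {name}({args_str}); // {trailer}"

print(registration_declaration(
    "Tensor", "example_op", "const Tensor & self",
    "aten::example_op(Tensor self) -> Tensor", True, False))
# Tensor example_op(const Tensor & self); // {"schema": "aten::example_op(Tensor self) -> Tensor", "dispatch": "True", "math": "False"}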
cpu_fm.write('SchemaRegister.cpp', computeSchemaRegister) - if options.per_op_registration: - def gen_per_op_registration_filename(opname: str) -> str: - return 'pt_op_register_{}.cpp'.format(opname.replace(':', '-')) - - if op_registration_whitelist is None: - raise Exception("Must set --op_registration_whitelist for per-op registration.") - - # First, group all native functions by unoverloaded operator name - grouped_functions : DefaultDict[str, List[NativeFunction]] = DefaultDict(list) - for f in native_functions: - grouped_functions[f"aten::{f.func.name.name}"].append(f) - extra_headers = [] - for b in backends: - extra_headers.append(f'#include ') - - # Next, generate registration for each one - for name in op_registration_whitelist: - def computePerOpRegistration() -> Dict[str, object]: - fs = grouped_functions[name] - registrations: List[str] = [] - for mb_dispatch in itertools.chain([None], backends): - # or you could pass in op_registration_whitelist, it doesn't - # matter! - # NB: Use of compute_type_method here is kind of an abuse; - # this is why we have to unconditionally write in - # torch::dispatch in the registration when it should be - # contextually clear - registrations.extend( - mapMaybe( - compute_type_method(mb_dispatch, target=Target.REGISTRATION, op_registration_whitelist=None), - fs)) - return { - 'extra_headers': extra_headers, - 'function_registrations': registrations, - } - - cpu_fm.write_with_template( - gen_per_op_registration_filename(name), 'PerOpRegistration.cpp', computePerOpRegistration) - cpu_fm.write('Declarations.yaml', lambda: format_yaml(list(map(compute_declaration_yaml, native_functions)))) + cpu_fm.write('RegistrationDeclarations.h', lambda: { + 'registration_declarations': list(map(compute_registration_declarations, native_functions)), + }) if options.output_dependencies: cpu_fm.write_outputs(options.output_dependencies) diff --git a/tools/codegen/local.py b/tools/codegen/local.py index 9244cb181aec..41deef4884f0 100644 --- a/tools/codegen/local.py +++ b/tools/codegen/local.py @@ -18,7 +18,6 @@ class Locals(threading.local): use_c10_dispatcher: Optional[UseC10Dispatcher] = None - hack_const_mutable_self: bool = False _locals = Locals() # The use_c10_dispatcher field in native_functions.yaml is used to @@ -31,19 +30,11 @@ def use_c10_dispatcher() -> UseC10Dispatcher: "need to initialize local.use_c10_dispatcher with local.parametrize" return _locals.use_c10_dispatcher -# This is used to maintain compat, see Note [Byte-for-byte compatibility] -# It can be removed when we drop compat. 
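# A self-contained sketch of the thread-local parametrization pattern that
# tools/codegen/local.py is reduced to in the hunk around this point: a context manager
# sets a per-thread value for the duration of a with-block and restores the previous
# value afterwards. Names here are generic, not the actual tools.codegen.local API.
import threading
from contextlib import contextmanager

_locals = threading.local()

def current_mode():
    assert getattr(_locals, "mode", None) is not None, "must be inside parametrize()"
    return _locals.mode

@contextmanager
def parametrize(mode):
    old = getattr(_locals, "mode", None)
    _locals.mode = mode
    try:
        yield
    finally:
        _locals.mode = old

with parametrize("full"):
    assert current_mode() == "full"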
-def hack_const_mutable_self() -> bool: - return _locals.hack_const_mutable_self - @contextmanager -def parametrize(*, use_c10_dispatcher: UseC10Dispatcher, hack_const_mutable_self: bool) -> Iterator[None]: +def parametrize(*, use_c10_dispatcher: UseC10Dispatcher) -> Iterator[None]: old_use_c10_dispatcher = _locals.use_c10_dispatcher - old_hack_const_mutable_self = _locals.hack_const_mutable_self try: _locals.use_c10_dispatcher = use_c10_dispatcher - _locals.hack_const_mutable_self = hack_const_mutable_self yield finally: _locals.use_c10_dispatcher = old_use_c10_dispatcher - _locals.hack_const_mutable_self = old_hack_const_mutable_self diff --git a/tools/codegen/model.py b/tools/codegen/model.py index b0c470c91b6a..7dd1f6ff505c 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -1,7 +1,7 @@ import re from dataclasses import dataclass -from typing import List, Sequence, Dict, Optional, Iterator, Tuple, Set, NoReturn +from typing import List, Dict, Optional, Iterator, Tuple, Set, NoReturn from enum import Enum import itertools @@ -197,6 +197,8 @@ def __post_init__(self) -> None: "otherwise you will tickle a Python argument binding bug " \ "(which usually manifests itself as the result variable being undefined.)" +SchemaKind = Enum('SchemaKind', ('functional', 'inplace', 'out')) + # The function schema is undoubtedly the most important data structure # in all of the codegen, as it defines the type signature for operators, # and most of the code generation we do is type directed (e.g., look at @@ -255,18 +257,17 @@ class FunctionSchema: # The name of the operator this function schema describes. name: 'OperatorName' - # NB: Sequence here is intentional, to make it read only - arguments: Sequence['Argument'] - kwarg_only_arguments: Sequence['Argument'] # but not including out args + arguments: Tuple['Argument', ...] + kwarg_only_arguments: Tuple['Argument', ...] # but not including out args # Unlike in the previous codegen, we have factored out 'out' arguments # in the canonical representation, removing them from kwarg # arguments. This choice is justified by numerous downstream # transformations which treat out arguments specially; additionally, # you can see that canonicity is not violated! - out_arguments: Sequence['Argument'] # these are also kwarg-only + out_arguments: Tuple['Argument', ...] # these are also kwarg-only # TODO: Need to handle collisions with argument names at some point - returns: Sequence['Return'] + returns: Tuple['Return', ...] def schema_order_arguments(self) -> Iterator['Argument']: return itertools.chain(self.arguments, self.kwarg_only_arguments, self.out_arguments) @@ -303,7 +304,11 @@ def __post_init__(self) -> None: if self.name.name.inplace: # TODO: fixme if str(self.name) not in [ - '_amp_non_finite_check_and_unscale_', + '_amp_foreach_non_finite_check_and_unscale_', + '_foreach_add_scalar_list_', + '_foreach_sub_scalar_list_', + '_foreach_mul_scalar_list_', + '_foreach_div_scalar_list_', '_foreach_add_.Scalar', '_foreach_sub_.Scalar', '_foreach_mul_.Scalar', @@ -347,6 +352,76 @@ def is_out_fn(self) -> bool: # we only do this check in tools/ return bool(self.out_arguments) + def kind(self) -> SchemaKind: + """ + What kind of schema is this? A functional schema is one + that returns a newly allocated output; an inplace schema + modifies the self argument inplace; an out schema writes + the result into an explicitly provided out argument. 
+ """ + is_inplace = self.name.name.inplace + is_out = bool(self.out_arguments) + assert not (is_inplace and is_out) + if is_inplace: + return SchemaKind.inplace + elif is_out: + return SchemaKind.out + else: + return SchemaKind.functional + + # WARNING: This method is not currently tested in any meaningful way + def signature(self) -> 'FunctionSchema': + """ + Certain schemas are 'related', in that they are simply + inplace/out/functional versions of the same function. This method + factors these schemas into the "core" functional signature which + is equal across all versions. + + Here is what normalization happens to the schema to convert + it to a signature: + - The overload name is stripped (name is retained, since + it expresses semantic content about what the function does) + - Inplace is set False + - Out arguments are stripped + - Mutability annotations are stripped (this is sound + because you cannot overload on mutability annotation) + + This function is based off of get_signature in + tools.autograd.load_derivatives + """ + + # dataclasses.replace could be used here, but it is less + # type safe so for now I've opted to type everything out + def strip_arg_annotation(a: Argument) -> Argument: + return Argument( + name=a.name, + type=a.type, + default=a.default, # hmmm + annotation=None, + ) + + def strip_ret_annotation(r: Return) -> Return: + return Return( + name=r.name, + type=r.type, + annotation=None, + ) + + return FunctionSchema( + name=OperatorName( + name=BaseOperatorName( + base=self.name.name.base, + inplace=False, + dunder_method=self.name.name.dunder_method, + ), + overload_name="", # stripped + ), + arguments=tuple(map(strip_arg_annotation, self.arguments)), + kwarg_only_arguments=tuple(map(strip_arg_annotation, self.kwarg_only_arguments)), + out_arguments=(), # stripped + returns=tuple(map(strip_ret_annotation, self.returns)), + ) + def __str__(self) -> str: all_arguments: List[str] = [] all_arguments.extend(map(str, self.arguments)) @@ -372,14 +447,14 @@ def __str__(self) -> str: class Annotation: # Typically only has one element. Not actually a set so # we can conveniently assume it is canonically ordered - alias_set: Sequence[str] + alias_set: Tuple[str, ...] is_write: bool @staticmethod def parse(ann: str) -> 'Annotation': m = re.match(r'^([a-z])(!?)$', ann) assert m is not None, f'unrecognized alias annotation {ann}' - alias_set = [m.group(1)] + alias_set = (m.group(1),) is_write = m.group(2) == '!' 
r = Annotation(alias_set=alias_set, is_write=is_write) assert str(r) == ann, f'{r} != {ann}' @@ -725,21 +800,18 @@ def __str__(self) -> str: # Helper functions for parsing argument lists (both inputs and returns) -def parse_returns(return_decl: str) -> Sequence[Return]: +def parse_returns(return_decl: str) -> Tuple[Return, ...]: """ Input: '()' Output: [] """ if return_decl == '()': - return [] + return () if return_decl[0] == '(' and return_decl[-1] == ')': return_decl = return_decl[1:-1] - returns = [] - for arg in return_decl.split(', '): - returns.append(Return.parse(arg)) - return returns + return tuple(Return.parse(arg) for arg in return_decl.split(', ')) -def parse_arguments(args: str) -> Tuple[Sequence[Argument], Sequence[Argument], Sequence[Argument]]: +def parse_arguments(args: str) -> Tuple[Tuple[Argument, ...], Tuple[Argument, ...], Tuple[Argument, ...]]: """ Input: 'int x, int y, int z' Output: positional args, kwarg only args @@ -774,4 +846,4 @@ def parse_arguments(args: str) -> Tuple[Sequence[Argument], Sequence[Argument], assert arguments_acc is not out_arguments arguments_acc.append(parg) - return arguments, kwarg_only_arguments, out_arguments + return tuple(arguments), tuple(kwarg_only_arguments), tuple(out_arguments) diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index 4b91abf1c6c7..576a0b39f501 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -1,4 +1,4 @@ -from __future__ import print_function + import os import collections from pprint import pformat @@ -74,11 +74,7 @@ # Somehow, these are defined in both _C and in functional. Ick! 'broadcast_tensors', # Manually define named tensor type stubs in __init__.pyi.in - 'rename', - 'refine_names', - 'align_to', 'align_tensors', - 'unflatten', 'meshgrid', 'cartesian_prod', 'block_diag', @@ -87,7 +83,6 @@ 'stft', 'istft', 'tensordot', - 'norm', 'split', 'unique_consecutive', 'atleast_1d', @@ -151,6 +146,7 @@ def type_to_python(typename, size=None): 'Dimname': 'Union[str, ellipsis, None]', 'DimnameList': 'Sequence[Union[str, ellipsis, None]]', 'QScheme': '_qscheme', + 'ArrayRef' : 'Sequence[float]' }[typename] return typename @@ -404,6 +400,14 @@ def gen_nn_functional(out): } write(out, 'torch/nn/functional.pyi', stubs, env) + # functional.pyi already contains the definitions for those functions + # so, we don't export then to it + from_c.extend(['hardtanh', 'leaky_relu', 'hardsigmoid']) + dispatch_code = ["{}: Callable".format(_) for _ in (dispatches + from_c)] + env = { + 'imported_hints': import_code, + 'dispatched_hints': dispatch_code + } stubs = CodeTemplate.from_file(os.path.join('torch', '_C', '_nn.pyi.in')) write(out, 'torch/_C/_nn.pyi', stubs, env) @@ -470,10 +474,12 @@ def gen_pyi(declarations_path, out): ' generator: Optional[Generator]=None, {}) -> Tensor: ...' .format(FACTORY_PARAMS)], 'full': ['def full(size: _size, fill_value: Number, *,' - ' out: Optional[Tensor]=None, {}) -> Tensor: ...' + ' out: Optional[Tensor]=None,' + ' layout: _layout=strided, {}) -> Tensor: ...' .format(FACTORY_PARAMS), 'def full(size: _size, fill_value: Number, *,' - ' names: List[Union[str, None]], {}) -> Tensor: ...' + ' names: List[Union[str, None]],' + ' layout: _layout=strided, {}) -> Tensor: ...' 
.format(FACTORY_PARAMS)], 'is_grad_enabled': ['def is_grad_enabled() -> _bool: ...'], 'nonzero': ['def nonzero(input: Tensor, *, out: Optional[Tensor]=None) -> Tensor: ...', @@ -536,6 +542,7 @@ def gen_pyi(declarations_path, out): 'def __init__(self, other: Tensor) -> None: ...', 'def __init__(self, size: {}, *, {}) -> None: ...'.format(type_to_python('IntArrayRef'), DEVICE_PARAM), ], + 'as_subclass': ["def as_subclass(self, cls: Tensor) -> Tensor: ..."], # clamp has no default values in the Declarations 'clamp': ["def clamp(self, min: _float=-inf, max: _float=inf," " *, out: Optional[Tensor]=None) -> Tensor: ..."], @@ -546,6 +553,7 @@ def gen_pyi(declarations_path, out): 'tolist': ['def tolist(self) -> List: ...'], 'requires_grad_': ['def requires_grad_(self, mode: _bool=True) -> Tensor: ...'], 'element_size': ['def element_size(self) -> _int: ...'], + 'data_ptr': ['def data_ptr(self) -> _int: ...'], 'dim': ['def dim(self) -> _int: ...'], 'nonzero': ['def nonzero(self, *, as_tuple: _bool=...) -> Tensor: ...'], 'numel': ['def numel(self) -> _int: ...'], @@ -576,6 +584,10 @@ def gen_pyi(declarations_path, out): ], 'item': ["def item(self) -> Number: ..."], 'copy_': ["def copy_(self, src: Tensor, non_blocking: _bool=False) -> Tensor: ..."], + 'set_': ['def set_(self, storage: Storage, offset: _int, size: _size, stride: _size) -> Tensor: ...', + 'def set_(self, storage: Storage) -> Tensor: ...'], + 'split': ['def split(self, split_size: _int, dim: _int=0) -> Sequence[Tensor]: ...', + 'def split(self, split_size: Tuple[_int, ...], dim: _int=0) -> Sequence[Tensor]: ...'], }) for binop in ['mul', 'div', 'true_divide', 'floor_divide']: for inplace in [False, True]: @@ -632,7 +644,7 @@ def gen_pyi(declarations_path, out): for c in ('Double', 'Float', 'Long', 'Int', 'Short', 'Char', 'Byte', 'Bool', 'Half', 'BFloat16', 'ComplexDouble', - 'ComplexFloat', 'QUInt8', 'QInt8', 'QInt32'): + 'ComplexFloat', 'QUInt8', 'QInt8', 'QInt32', 'QUInt4x2'): legacy_storage_base_hints.append('class {}StorageBase(object): ...'.format(c)) legacy_class_hints = [] @@ -650,7 +662,7 @@ def gen_pyi(declarations_path, out): ['float32', 'float', 'float64', 'double', 'float16', 'bfloat16', 'half', 'uint8', 'int8', 'int16', 'short', 'int32', 'int', 'int64', 'long', 'complex32', 'complex64', 'cfloat', 'complex128', 'cdouble', - 'quint8', 'qint8', 'qint32', 'bool']] + 'quint8', 'qint8', 'qint32', 'bool', 'quint4x2']] # Generate __all__ directive # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py index d5db749d1552..abbfb6e7a65f 100644 --- a/tools/setup_helpers/cmake.py +++ b/tools/setup_helpers/cmake.py @@ -1,6 +1,6 @@ "Manages CMake." -from __future__ import print_function + import multiprocessing import os @@ -245,6 +245,7 @@ def generate(self, version, cmake_python_library, build_python, build_test, my_e 'MKL_THREADING', 'MKLDNN_CPU_RUNTIME', 'MSVC_Z7_OVERRIDE', + 'CAFFE2_USE_MSVC_STATIC_RUNTIME', 'Numa_INCLUDE_DIR', 'Numa_LIBRARIES', 'ONNX_ML', diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index b78dc4a362a7..f64025c34683 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -46,7 +46,7 @@ append_filelist("libtorch_python_core_sources" TORCH_PYTHON_SRCS) # NB: This has to match the condition under which the JIT test directory # is included (at the time of writing that's in caffe2/CMakeLists.txt). 
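# A few runtime calls that the type-stub additions in the gen_pyi.py hunks above are
# meant to cover. This assumes a working torch install and only exercises public APIs;
# it is not part of the code generation itself.
import torch

t = torch.full((2, 3), 1.5, layout=torch.strided)   # full() overloads now spell out layout
print(t.data_ptr())                                  # data_ptr(self) -> _int
halves = t.split(2, dim=1)                           # split by a fixed chunk size...
uneven = t.split([1, 2], dim=1)                      # ...or by an explicit list of sizes
print([x.shape for x in halves], [x.shape for x in uneven])

class LoggingTensor(torch.Tensor):
    pass

lt = t.as_subclass(LoggingTensor)                    # as_subclass(self, cls) -> Tensor
print(type(lt).__name__)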
-if(BUILD_TEST AND NOT USE_ROCM) +if(BUILD_TEST) add_definitions(-DBUILDING_TESTS) list(APPEND TORCH_PYTHON_SRCS ${TORCH_ROOT}/test/cpp/jit/torch_python_test.cpp @@ -66,6 +66,9 @@ set(TORCH_PYTHON_INCLUDE_DIRECTORIES ${CMAKE_BINARY_DIR}/third_party ${CMAKE_BINARY_DIR}/third_party/onnx + ${TORCH_ROOT}/third_party/valgrind/callgrind + ${TORCH_ROOT}/third_party/valgrind/include + ${TORCH_ROOT}/third_party/gloo ${TORCH_ROOT}/third_party/onnx ${pybind11_INCLUDE_DIRS} @@ -160,25 +163,28 @@ endif() if(USE_DISTRIBUTED) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_DISTRIBUTED) - if(NOT MSVC) + if(WIN32) + append_filelist("libtorch_python_distributed_core_sources" TORCH_PYTHON_SRCS) + else() + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_RPC) append_filelist("libtorch_python_distributed_sources" TORCH_PYTHON_SRCS) - # Disable certain warnings for GCC-9.X - if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0)) - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - endif() - list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d) - list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D) - if(USE_TENSORPIPE) - list(APPEND TORCH_PYTHON_LINK_LIBRARIES tensorpipe) - list(APPEND TORCH_PYTHON_PUBLIC_COMPILE_DEFINITIONS USE_TENSORPIPE) - endif() endif() + # Disable certain warnings for GCC-9.X + if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0)) + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + endif() + if(USE_TENSORPIPE) + list(APPEND TORCH_PYTHON_LINK_LIBRARIES tensorpipe) + list(APPEND TORCH_PYTHON_PUBLIC_COMPILE_DEFINITIONS USE_TENSORPIPE) + endif() + list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d) + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D) endif() -if(USE_NCCL) +if(USE_NCCL AND NOT WIN32) list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/cuda/python_nccl.cpp) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_NCCL) diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 0d48ea710fdd..9ccc5f7cb899 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -28,7 +28,7 @@ class device: # THPDevice_pynew @overload - def __init__(self, device: Union[_int, str]) -> None: ... + def __init__(self, device: Union[_device, _int, str]) -> None: ... @overload def __init__(self, type: str, index: _int) -> None: ... @@ -87,6 +87,9 @@ ${dtype_class_hints} class layout: ... +# Defined in torch/csrc/utils/disable_torch_function.cpp +def DisableTorchFunction(): ... + # Defined in torch/csrc/utils/tensor_layouts.cpp strided : layout = ... sparse_coo : layout = ... 
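The widened ``device`` overload above (``Union[_device, _int, str]``) documents that an existing ``torch.device`` can be passed back to the constructor; a hedged usage sketch::

    import torch

    d = torch.device('cuda', 0) if torch.cuda.is_available() else torch.device('cpu')
    # Code that normalizes "anything device-like" can simply re-wrap its input:
    same = torch.device(d)
    assert same == d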
@@ -103,8 +106,12 @@ preserve_format: memory_format = ... # Defined in torch/csrc/QScheme.cpp class qscheme: ... -# Defined in torch/csrc/utils/tensor_qschemes.cpp +# Defined in torch/csrc/utils/tensor_qschemes.h per_tensor_affine: qscheme = ... +per_channel_affine: qscheme = ... +per_tensor_symmetric: qscheme = ... +per_channel_symmetric: qscheme = ... +per_channel_affine_float_qparams: qscheme = ... # Defined in torch/csrc/autograd/python_function.cpp class _FunctionBase(object): @@ -132,6 +139,8 @@ class Future(object): def then(self, callback: Callable) -> Future: ... def set_result(self, result: Any) -> None: ... +def _jit_set_num_profiled_runs(num: _size) -> _size: ... + # Defined in torch/csrc/jit/passes/xnnpack_rewrite.h class MobileOptimizerType: ... @@ -170,6 +179,20 @@ def _jit_set_texpr_fuser_enabled(enable: _bool): ... def _jit_set_nvfuser_enabled(enable: _bool) -> _bool: ... def _jit_pass_canonicalize(graph: Graph): ... def _jit_pass_erase_shape_information(graph: Graph): ... +def _jit_pass_fold_convbn(module: 'torch.jit.ScriptModule'): ... +def _jit_pass_insert_observers(module: 'torch.jit.ScriptModule', + method_name: str, + qconfig_dict: Dict[str, Any], + inplace: _bool, + quant_type: _int): ... +def _jit_pass_insert_quant_dequant(module: 'torch.jit.ScriptModule', + method_name: str, + inplace: _bool, + debug: _bool, + quant_type: _int): ... +def _jit_pass_quant_finalize(module: 'torch.jit.ScriptModule', + quant_type: _int, + preserved_attrs: Sequence[str]): ... def _jit_set_profiling_executor(profiling_flag: _bool) -> _bool: ... def _jit_set_profiling_mode(profiling_flag: _bool) -> _bool: ... def _jit_try_infer_type(obj: Any) -> JitType: ... @@ -224,6 +247,7 @@ def _jit_script_compile( def _jit_script_class_compile( qual_name: str, definition: ClassDef, + defaults: Dict[str, Dict[str, Any]], rcb: ResolutionCallback, ): ... def _parse_source_def(src: str) -> Def: ... @@ -364,6 +388,10 @@ def _vmapmode_increment_nesting() -> _int: ... # THPModule_vmapmode_increment_n def _vmapmode_decrement_nesting() -> _int: ... # THPModule_vmapmode_decrement_nesting def _log_api_usage_once(str) -> None: ... # LogAPIUsageOnceFromPython +# Defined in `valgrind.h` and `callgrind.h` respectively. +def valgrind_supported_platform() -> _bool: ... # NVALGRIND +def valgrind_toggle() -> None: ... # CALLGRIND_TOGGLE_COLLECT + has_openmp: _bool has_mkl: _bool has_lapack: _bool @@ -379,8 +407,8 @@ def is_grad_enabled() -> _bool: ... def set_autocast_enabled(enabled: _bool) -> None: ... def is_autocast_enabled() -> _bool: ... def clear_autocast_cache() -> None: ... -def autocast_increment_nesting() -> None: ... -def autocast_decrement_nesting() -> None: ... +def autocast_increment_nesting() -> _int: ... +def autocast_decrement_nesting() -> _int: ... def set_anomaly_enabled(enabled: _bool) -> None: ... def is_anomaly_enabled() -> _bool: ... @@ -489,6 +517,7 @@ class _TensorBase(object): def _cuda_getCurrentStream(device: _int) -> _int: ... def _cuda_getDefaultStream(device: _int) -> _int: ... def _cuda_getCurrentBlasHandle() -> _int: ... +def _cuda_setDevice(device: _int) -> None: ... def _cuda_setStream(cuda_stream: _int) -> None: ... def _cuda_getCompiledVersion() -> _int: ... def _cuda_cudaHostAllocator() -> _int: ... @@ -503,6 +532,32 @@ def _cuda_lock_mutex() -> None: ... def _cuda_unlock_mutex() -> None: ... def _nccl_version() -> _int: ... def _nccl_unique_id() -> bytes: ... +def _nccl_init_rank(nranks: _int, comm_id: bytes, rank: _int) -> object: ...
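The two Callgrind hooks declared above suggest a toggle-around-a-region pattern; a hedged sketch, assuming the bindings are exposed on ``torch._C`` exactly as the stub spells them (the wrapper name here is illustrative)::

    import torch

    def run_collected(fn):
        # Only meaningful under `valgrind --tool=callgrind --collect-atstart=no ...`;
        # elsewhere it degrades to a plain call.
        if not torch._C.valgrind_supported_platform():
            return fn()
        torch._C.valgrind_toggle()      # CALLGRIND_TOGGLE_COLLECT: start collecting
        try:
            return fn()
        finally:
            torch._C.valgrind_toggle()  # stop collecting

    run_collected(lambda: torch.ones(8).sum())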
+def _nccl_reduce(input: Sequence[Tensor], + output: Tensor, + root: _int, + op: _int, + streams: Optional[Sequence[_CudaStreamBase]], + comms: Optional[Sequence[object]]) -> None: ... +def _nccl_all_reduce(input: Sequence[Tensor], + output: Sequence[Tensor], + op: _int, + streams: Optional[Sequence[_CudaStreamBase]], + comms: Optional[Sequence[object]]) -> None: ... +def _nccl_broadcast(input: Sequence[Tensor], + root: _int, + streams: Optional[Sequence[_CudaStreamBase]], + comms: Optional[Sequence[object]]) -> None: ... +def _nccl_all_gather(input: Sequence[Tensor], + output: Sequence[Tensor], + streams: Optional[Sequence[_CudaStreamBase]], + comms: Optional[Sequence[object]]) -> None: ... +def _nccl_reduce_scatter(input: Sequence[Tensor], + output: Sequence[Tensor], + op: _int, + streams: Optional[Sequence[_CudaStreamBase]], + comms: Optional[Sequence[object]]) -> None: ... + class _CudaDeviceProperties: name: str @@ -515,6 +570,7 @@ class _CudaDeviceProperties: # Defined in torch/csrc/cuda/Stream.cpp class _CudaStreamBase: + _cdata: _int device: _device cuda_stream: _int priority: _int @@ -653,6 +709,8 @@ class EnumType(JitType): class TensorType(JitType): @classmethod def get(cls) -> TensorType: ... + @classmethod + def getInferred(cls) -> TensorType: ... # Defined in torch/csrc/jit/python/python_tree_views.cpp class SourceRange: diff --git a/torch/__init__.py b/torch/__init__.py index 6523ab126c0d..1ca766fa77ca 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -12,6 +12,7 @@ import os import sys import platform +import textwrap import ctypes if sys.version_info < (3,): @@ -193,6 +194,31 @@ def _load_global_deps(): if TYPE_CHECKING: import torch._C as _C +# Check to see if we can load C extensions, and if not provide some guidance +# on what the problem might be. +try: + # _initExtension is chosen (arbitrarily) as a sentinel. + from torch._C import _initExtension +except ImportError: + import torch._C as _C_for_compiled_check + + # The __file__ check only works for Python 3.7 and above. + if sys.version_info >= (3, 7) and _C_for_compiled_check.__file__ is None: + raise ImportError(textwrap.dedent(''' + Failed to load PyTorch C extensions: + It appears that PyTorch has loaded the `torch/_C` folder + of the PyTorch repository rather than the C extensions which + are expected in the `torch._C` namespace. This can occur when + using the `install` workflow. e.g. + $ python setup.py install && python -c "import torch" + + This error can generally be solved using the `develop` workflow + $ python setup.py develop && python -c "import torch" # This should succeed + or by running Python from a different directory. + ''').strip()) from None + raise # If __file__ is not None the cause is unknown, so just re-raise. + + __all__ += [name for name in dir(_C) if name[0] != '_' and not name.endswith('Base')] @@ -300,14 +326,16 @@ def set_default_dtype(d): _C._set_default_dtype(d) def set_deterministic(d): - r""" Sets whether native PyTorch operations must use deterministic - algorithms. When True, operations without deterministic algorithms - will throw a :class:RuntimeError when called. + r""" Sets whether PyTorch operations must use "deterministic" + algorithms. That is, algorithms which, given the same input, and when + run on the same software and hardware, always produce the same output. + When True, operations will use deterministic algorithms when available, + and if only nondeterministic algorithms are available they will throw a + :class:RuntimeError when called. .. 
warning:: - This feature is a beta feature, so it does not affect every - nondeterministic operation yet. The following operations are - affected by this flag. + This feature is in beta, and its design and implementation may change + in the future. The following normally-nondeterministic operations will act deterministically when `d=True`: @@ -439,11 +467,13 @@ class QInt8Storage(_C.QInt8StorageBase, _StorageBase): class QInt32Storage(_C.QInt32StorageBase, _StorageBase): pass +class QUInt4x2Storage(_C.QUInt4x2StorageBase, _StorageBase): + pass _storage_classes = { DoubleStorage, FloatStorage, LongStorage, IntStorage, ShortStorage, CharStorage, ByteStorage, HalfStorage, BoolStorage, QUInt8Storage, QInt8Storage, - QInt32Storage, BFloat16Storage, ComplexFloatStorage, ComplexDoubleStorage + QInt32Storage, BFloat16Storage, ComplexFloatStorage, ComplexDoubleStorage, QUInt4x2Storage } # The _tensor_classes set is initialized by the call to _C._initialize_tensor_type_bindings() @@ -477,9 +507,9 @@ def manager_path(): # is not a good way to fix this problem. Perhaps, try to redesign VariableFunctions # so that this import is good enough if TYPE_CHECKING: - # Some type signatures pulled in from _VariableFunctions here clash with + # Some type signatures pulled in from _VariableFunctions here clash with # signatures already imported. For now these clashes are ignored; see - # PR #43339 for details. + # PR #43339 for details. from torch._C._VariableFunctions import * # type: ignore for name in dir(_C._VariableFunctions): @@ -512,6 +542,7 @@ def manager_path(): del BFloat16StorageBase del ComplexDoubleStorageBase del ComplexFloatStorageBase +del QUInt4x2StorageBase ################################################################################ # Import most common subpackages @@ -526,6 +557,7 @@ def manager_path(): import torch.nn.intrinsic import torch.nn.quantized import torch.optim +import torch.optim._multi_tensor import torch.multiprocessing import torch.sparse import torch.utils.backcompat @@ -586,3 +618,12 @@ def compiled_with_cxx11_abi(): # class usage. We add these lines here to preserve backward compatbility. quantized_lstm = torch.ops.aten.quantized_lstm quantized_gru = torch.ops.aten.quantized_gru + +from .overrides import has_torch_function, handle_torch_function + +def Assert(condition, message): + r"""A wrapper around Python's assert which is symbolically traceable. 
+ """ + if type(condition) is not torch.Tensor and has_torch_function((condition,)): + return handle_torch_function(Assert, (condition,), condition, message) + assert condition, message diff --git a/torch/_jit_internal.py b/torch/_jit_internal.py index 5fa2ee639a9f..e9fb21c5e854 100644 --- a/torch/_jit_internal.py +++ b/torch/_jit_internal.py @@ -390,6 +390,15 @@ def forward(self, x): # exception raised m(torch.rand(100)) """ + if isinstance(fn, property): + prop = fn + setattr(prop.fget, "_torchscript_modifier", FunctionModifiers.UNUSED) # noqa: B010 + + if prop.fset: + setattr(prop.fset, "_torchscript_modifier", FunctionModifiers.UNUSED) # noqa: B010 + + return prop + fn._torchscript_modifier = FunctionModifiers.UNUSED return fn diff --git a/torch/_lobpcg.py b/torch/_lobpcg.py index b0cbf45b252b..ec0ad81dced0 100644 --- a/torch/_lobpcg.py +++ b/torch/_lobpcg.py @@ -13,23 +13,343 @@ __all__ = ['lobpcg'] +def _symeig_backward_complete_eigenspace(D_grad, U_grad, A, D, U): + # compute F, such that F_ij = (d_j - d_i)^{-1} for i != j, F_ii = 0 + F = D.unsqueeze(-2) - D.unsqueeze(-1) + F.diagonal(dim1=-2, dim2=-1).fill_(float('inf')) + F.pow_(-1) + + # A.grad = U (D.grad + (U^T U.grad * F)) U^T + Ut = U.transpose(-1, -2).contiguous() + res = torch.matmul( + U, + torch.matmul( + torch.diag_embed(D_grad) + torch.matmul(Ut, U_grad) * F, + Ut + ) + ) + + return res + + +def _polynomial_coefficients_given_roots(roots): + """ + Given the `roots` of a polynomial, find the polynomial's coefficients. + + If roots = (r_1, ..., r_n), then the method returns + coefficients (a_0, a_1, ..., a_n (== 1)) so that + p(x) = (x - r_1) * ... * (x - r_n) + = x^n + a_{n-1} * x^{n-1} + ... a_1 * x_1 + a_0 + + Note: for better performance requires writing a low-level kernel + """ + poly_order = roots.shape[-1] + poly_coeffs_shape = list(roots.shape) + # we assume p(x) = x^n + a_{n-1} * x^{n-1} + ... + a_1 * x + a_0, + # so poly_coeffs = {a_0, ..., a_n, a_{n+1}(== 1)}, + # but we insert one extra coefficient to enable better vectorization below + poly_coeffs_shape[-1] += 2 + poly_coeffs = roots.new_zeros(poly_coeffs_shape) + poly_coeffs[..., 0] = 1 + poly_coeffs[..., -1] = 1 + + # perform the Horner's rule + for i in range(1, poly_order + 1): + # note that it is computationally hard to compute backward for this method, + # because then given the coefficients it would require finding the roots and/or + # calculating the sensitivity based on the Vieta's theorem. + # So the code below tries to circumvent the explicit root finding by series + # of operations on memory copies imitating the Horner's method. + # The memory copies are required to construct nodes in the computational graph + # by exploting the explicit (not in-place, separate node for each step) + # recursion of the Horner's method. + # Needs more memory, O(... * k^2), but with only O(... * k^2) complexity. + poly_coeffs_new = poly_coeffs.clone() if roots.requires_grad else poly_coeffs + out = poly_coeffs_new.narrow(-1, poly_order - i, i + 1) + out -= roots.narrow(-1, i - 1, 1) * poly_coeffs.narrow(-1, poly_order - i + 1, i + 1) + poly_coeffs = poly_coeffs_new + + return poly_coeffs.narrow(-1, 1, poly_order + 1) + + +def _polynomial_value(poly, x, zero_power, transition): + """ + A generic method for computing poly(x) using the Horner's rule. 
-def lobpcg(A, # type: Tensor - k=None, # type: Optional[int] - B=None, # type: Optional[Tensor] - X=None, # type: Optional[Tensor] - n=None, # type: Optional[int] - iK=None, # type: Optional[Tensor] - niter=None, # type: Optional[int] - tol=None, # type: Optional[float] - largest=None, # type: Optional[bool] - method=None, # type: Optional[str] - tracker=None, # type: Optional[None] - ortho_iparams=None, # type: Optional[Dict[str, int]] - ortho_fparams=None, # type: Optional[Dict[str, float]] - ortho_bparams=None, # type: Optional[Dict[str, bool]] - ): - # type: (...) -> Tuple[Tensor, Tensor] + Arguments: + poly (Tensor): the (possibly batched) 1D Tensor representing + polynomial coefficients such that + poly[..., i] = (a_{i_0}, ..., a{i_n} (==1)), and + poly(x) = poly[..., 0] * zero_power + ... + poly[..., n] * x^n + + x (Tensor): the value (possibly batched) to evaluate the polynomial `poly` at. + + zero_power (Tensor): the representation of `x^0`. It is application-specific. + + transition (Callable): the function that accepts some intermediate result `int_val`, + the `x` and a specific polynomial coefficient + `poly[..., k]` for some iteration `k`. + It basically performs one iteration of the Horner's rule + defined as `x * int_val + poly[..., k] * zero_power`. + Note that `zero_power` is not a parameter, + because the step `+ poly[..., k] * zero_power` depends on `x`, + whether it is a vector, a matrix, or something else, so this + functionality is delegated to the user. + """ + + res = zero_power.clone() + for k in range(poly.size(-1) - 2, -1, -1): + res = transition(res, x, poly[..., k]) + return res + +def _matrix_polynomial_value(poly, x, zero_power=None): + """ + Evaluates `poly(x)` for the (batched) matrix input `x`. + Check out `_polynomial_value` function for more details. + """ + + # matrix-aware Horner's rule iteration + def transition(curr_poly_val, x, poly_coeff): + res = x.matmul(curr_poly_val) + res.diagonal(dim1=-2, dim2=-1).add_(poly_coeff.unsqueeze(-1)) + return res + + if zero_power is None: + zero_power = torch.eye(x.size(-1), x.size(-1), dtype=x.dtype, device=x.device) \ + .view(*([1] * len(list(x.shape[:-2]))), x.size(-1), x.size(-1)) + + return _polynomial_value(poly, x, zero_power, transition) + +def _vector_polynomial_value(poly, x, zero_power=None): + """ + Evaluates `poly(x)` for the (batched) vector input `x`. + Check out `_polynomial_value` function for more details. + """ + + # vector-aware Horner's rule iteration + def transition(curr_poly_val, x, poly_coeff): + res = torch.addcmul(poly_coeff.unsqueeze(-1), x, curr_poly_val) + return res + + if zero_power is None: + zero_power = x.new_ones(1).expand(x.shape) + + return _polynomial_value(poly, x, zero_power, transition) + +def _symeig_backward_partial_eigenspace(D_grad, U_grad, A, D, U, largest): + # compute a projection operator onto an orthogonal subspace spanned by the + # columns of U defined as (I - UU^T) + Ut = U.transpose(-2, -1).contiguous() + proj_U_ortho = -U.matmul(Ut) + proj_U_ortho.diagonal(dim1=-2, dim2=-1).add_(1) + + # compute U_ortho, a basis for the orthogonal complement to the span(U), + # by projecting a random [..., m, m - k] matrix onto the subspace spanned + # by the columns of U.
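To make the conventions above concrete: ``_polynomial_coefficients_given_roots`` returns ascending coefficients with a leading 1 (roots (1, 2) give x^2 - 3x + 2, i.e. (2, -3, 1)), and each Horner ``transition`` is just ``res = x * res + a_k``. A tiny pure-Python cross-check (the helper names below are illustrative, not part of the patch)::

    def coeffs_from_roots(roots):
        # Expand prod_j (x - r_j); coeffs[i] is the coefficient of x**i.
        coeffs = [1.0]
        for r in roots:
            shifted = [0.0] + coeffs                   # x * p(x)
            scaled = [-r * c for c in coeffs] + [0.0]  # -r * p(x)
            coeffs = [a + b for a, b in zip(shifted, scaled)]
        return coeffs

    def horner_eval(coeffs, x):
        # Unlike _polynomial_value above, this does not assume a monic polynomial,
        # so the accumulator starts at 0 rather than at `zero_power`.
        res = 0.0
        for a in reversed(coeffs):
            res = x * res + a                          # one "transition" step
        return res

    c = coeffs_from_roots([1.0, 2.0])
    assert c == [2.0, -3.0, 1.0]                       # x**2 - 3*x + 2
    assert horner_eval(c, 5.0) == 12.0                 # (5 - 1) * (5 - 2)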
+ # + # fix generator for determinism + gen = torch.Generator(A.device) + + # orthogonal complement to the span(U) + U_ortho = proj_U_ortho.matmul( + torch.randn( + (*A.shape[:-1], A.size(-1) - D.size(-1)), + dtype=A.dtype, + device=A.device, + generator=gen + ) + ) + U_ortho_t = U_ortho.transpose(-2, -1).contiguous() + + # compute the coefficients of the characteristic polynomial of the tensor D. + # Note that D is diagonal, so the diagonal elements are exactly the roots + # of the characteristic polynomial. + chr_poly_D = _polynomial_coefficients_given_roots(D) + + # the code belows finds the explicit solution to the Sylvester equation + # U_ortho^T A U_ortho dX - dX D = -U_ortho^T A U + # and incorporates it into the whole gradient stored in the `res` variable. + # + # Equivalent to the following naive implementation: + # res = A.new_zeros(A.shape) + # p_res = A.new_zeros(*A.shape[:-1], D.size(-1)) + # for k in range(1, chr_poly_D.size(-1)): + # p_res.zero_() + # for i in range(0, k): + # p_res += (A.matrix_power(k - 1 - i) @ U_grad) * D.pow(i).unsqueeze(-2) + # res -= chr_poly_D[k] * (U_ortho @ poly_D_at_A.inverse() @ U_ortho_t @ p_res @ U.t()) + # + # Note that dX is a differential, so the gradient contribution comes from the backward sensitivity + # Tr(f(U_grad, D_grad, A, U, D)^T dX) = Tr(g(U_grad, A, U, D)^T dA) for some functions f and g, + # and we need to compute g(U_grad, A, U, D) + # + # The naive implementation is based on the paper + # Hu, Qingxi, and Daizhan Cheng. + # "The polynomial solution to the Sylvester matrix equation." + # Applied mathematics letters 19.9 (2006): 859-864. + # + # We can modify the computation of `p_res` from above in a more efficient way + # p_res = U_grad * (chr_poly_D[1] * D.pow(0) + ... + chr_poly_D[k] * D.pow(k)).unsqueeze(-2) + # + A U_grad * (chr_poly_D[2] * D.pow(0) + ... + chr_poly_D[k] * D.pow(k - 1)).unsqueeze(-2) + # + ... + # + A.matrix_power(k - 1) U_grad * chr_poly_D[k] + # Note that this saves us from redundant matrix products with A (elimination of matrix_power) + U_grad_projected = U_grad + series_acc = U_grad_projected.new_zeros(U_grad_projected.shape) + for k in range(1, chr_poly_D.size(-1)): + poly_D = _vector_polynomial_value(chr_poly_D[..., k:], D) + series_acc += U_grad_projected * poly_D.unsqueeze(-2) + U_grad_projected = A.matmul(U_grad_projected) + + # compute chr_poly_D(A) which essentially is: + # + # chr_poly_D_at_A = A.new_zeros(A.shape) + # for k in range(chr_poly_D.size(-1)): + # chr_poly_D_at_A += chr_poly_D[k] * A.matrix_power(k) + # + # Note, however, for better performance we use the Horner's rule + chr_poly_D_at_A = _matrix_polynomial_value(chr_poly_D, A) + + # compute the action of `chr_poly_D_at_A` restricted to U_ortho_t + chr_poly_D_at_A_to_U_ortho = torch.matmul( + U_ortho_t, + torch.matmul( + chr_poly_D_at_A, + U_ortho + ) + ) + # we need to invert 'chr_poly_D_at_A_to_U_ortho`, for that we compute its + # Cholesky decomposition and then use `torch.cholesky_solve` for better stability. + # Cholesky decomposition requires the input to be positive-definite. + # Note that `chr_poly_D_at_A_to_U_ortho` is positive-definite if + # 1. `largest` == False, or + # 2. `largest` == True and `k` is even + # under the assumption that `A` has distinct eigenvalues. 
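One building block used above, ``proj_U_ortho = I - U U^T``, is an orthogonal projector onto the complement of span(U) whenever U has orthonormal columns; a quick numerical sanity check (sizes arbitrary)::

    import torch

    m, k = 6, 3
    U, _ = torch.qr(torch.randn(m, k))       # orthonormal columns (reduced QR)
    proj = torch.eye(m) - U @ U.t()          # same operator the backward builds in-place

    assert torch.allclose(proj @ U, torch.zeros(m, k), atol=1e-6)  # annihilates span(U)
    assert torch.allclose(proj @ proj, proj, atol=1e-6)            # idempotent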
+ # + # check if `chr_poly_D_at_A_to_U_ortho` is positive-definite or negative-definite + chr_poly_D_at_A_to_U_ortho_sign = -1 if (largest and (k % 2 == 1)) else +1 + chr_poly_D_at_A_to_U_ortho_L = torch.cholesky( + chr_poly_D_at_A_to_U_ortho_sign * chr_poly_D_at_A_to_U_ortho + ) + + # compute the gradient part in span(U) + res = _symeig_backward_complete_eigenspace( + D_grad, U_grad, A, D, U + ) + + # incorporate the Sylvester equation solution into the full gradient + # it resides in span(U_ortho) + res -= U_ortho.matmul( + chr_poly_D_at_A_to_U_ortho_sign * torch.cholesky_solve( + U_ortho_t.matmul(series_acc), + chr_poly_D_at_A_to_U_ortho_L + ) + ).matmul(Ut) + + return res + +def _symeig_backward(D_grad, U_grad, A, D, U, largest): + # if `U` is square, then the columns of `U` is a complete eigenspace + if U.size(-1) == U.size(-2): + return _symeig_backward_complete_eigenspace( + D_grad, U_grad, A, D, U + ) + else: + return _symeig_backward_partial_eigenspace( + D_grad, U_grad, A, D, U, largest + ) + +class LOBPCGAutogradFunction(torch.autograd.Function): + + @staticmethod + def forward(ctx, + A: Tensor, + k: Optional[int] = None, + B: Optional[Tensor] = None, + X: Optional[Tensor] = None, + n: Optional[int] = None, + iK: Optional[Tensor] = None, + niter: Optional[int] = None, + tol: Optional[float] = None, + largest: Optional[bool] = None, + method: Optional[str] = None, + tracker: Optional[None] = None, + ortho_iparams: Optional[Dict[str, int]] = None, + ortho_fparams: Optional[Dict[str, float]] = None, + ortho_bparams: Optional[Dict[str, bool]] = None + ) -> Tuple[Tensor, Tensor]: + + # makes sure that input is contiguous for efficiency. + # Note: autograd does not support dense gradients for sparse input yet. + A = A.contiguous() if (not A.is_sparse) else A + if B is not None: + B = B.contiguous() if (not B.is_sparse) else B + + D, U = _lobpcg( + A, k, B, X, + n, iK, niter, tol, largest, method, tracker, + ortho_iparams, ortho_fparams, ortho_bparams + ) + + ctx.save_for_backward(A, B, D, U, largest) + + return D, U + + @staticmethod + def backward(ctx, D_grad, U_grad): + A_grad = B_grad = None + grads = [None] * 14 + + A, B, D, U, largest = ctx.saved_tensors + + # lobpcg.backward has some limitations. Checks for unsupported input + if A.is_sparse or (B is not None and B.is_sparse and ctx.needs_input_grad[2]): + raise ValueError( + 'lobpcg.backward does not support sparse input yet.' + 'Note that lobpcg.forward does though.' + ) + if A.dtype in (torch.complex64, torch.complex128) or \ + B is not None and B.dtype in (torch.complex64, torch.complex128): + raise ValueError( + 'lobpcg.backward does not support complex input yet.' + 'Note that lobpcg.forward does though.' + ) + if B is not None: + raise ValueError( + 'lobpcg.backward does not support backward with B != I yet.' 
+ ) + + if largest is None: + largest = True + + # symeig backward + if B is None: + A_grad = _symeig_backward( + D_grad, U_grad, A, D, U, largest + ) + + # A has index 0 + grads[0] = A_grad + # B has index 2 + grads[2] = B_grad + return tuple(grads) + + +def lobpcg(A: Tensor, + k: Optional[int] = None, + B: Optional[Tensor] = None, + X: Optional[Tensor] = None, + n: Optional[int] = None, + iK: Optional[Tensor] = None, + niter: Optional[int] = None, + tol: Optional[float] = None, + largest: Optional[bool] = None, + method: Optional[str] = None, + tracker: Optional[None] = None, + ortho_iparams: Optional[Dict[str, int]] = None, + ortho_fparams: Optional[Dict[str, float]] = None, + ortho_bparams: Optional[Dict[str, bool]] = None + ) -> Tuple[Tensor, Tensor]: """Find the k largest (or smallest) eigenvalues and the corresponding eigenvectors of a symmetric positive defined generalized @@ -53,6 +373,17 @@ def lobpcg(A, # type: Tensor not recommended but there exist cases where the usage of the basic method may be preferred. + .. warning:: The backward method does not support sparse and complex inputs. + It works only when `B` is not provided (i.e. `B == None`). + We are actively working on extensions, and the details of + the algorithms are going to be published promptly. + + .. warning:: While it is assumed that `A` is symmetric, `A.grad` is not. + To make sure that `A.grad` is symmetric, so that `A - t * A.grad` is symmetric + in first-order optimization routines, prior to running `lobpcg` + we do the following symmetrization map: `A -> (A + A.t()) / 2`. + The map is performed only when the `A` requires gradients. + Arguments: A (Tensor): the input tensor of size :math:`(*, m, m)` @@ -175,6 +506,51 @@ def lobpcg(A, # type: Tensor ortho_fparams=ortho_fparams, ortho_bparams=ortho_bparams) + if not torch._jit_internal.is_scripting(): + if A.requires_grad or (B is not None and B.requires_grad): + # While it is expected that `A` is symmetric, + # the `A_grad` might be not. Therefore we perform the trick below, + # so that `A_grad` becomes symmetric. + # The symmetrization is important for first-order optimization methods, + # so that (A - alpha * A_grad) is still a symmetric matrix. + # Same holds for `B`. + A_sym = (A + A.transpose(-2, -1)) / 2 + B_sym = (B + B.transpose(-2, -1)) / 2 if (B is not None) else None + + return LOBPCGAutogradFunction.apply( + A_sym, k, B_sym, X, n, iK, niter, tol, largest, + method, tracker, ortho_iparams, ortho_fparams, ortho_bparams + ) + else: + if A.requires_grad or (B is not None and B.requires_grad): + raise RuntimeError( + 'Script and require grads is not supported atm.' 
+ 'If you just want to do the forward, use .detach()' + 'on A and B before calling into lobpcg' + ) + + return _lobpcg( + A, k, B, X, + n, iK, niter, tol, largest, method, tracker, + ortho_iparams, ortho_fparams, ortho_bparams + ) + +def _lobpcg(A: Tensor, + k: Optional[int] = None, + B: Optional[Tensor] = None, + X: Optional[Tensor] = None, + n: Optional[int] = None, + iK: Optional[Tensor] = None, + niter: Optional[int] = None, + tol: Optional[float] = None, + largest: Optional[bool] = None, + method: Optional[str] = None, + tracker: Optional[None] = None, + ortho_iparams: Optional[Dict[str, int]] = None, + ortho_fparams: Optional[Dict[str, float]] = None, + ortho_bparams: Optional[Dict[str, bool]] = None + ) -> Tuple[Tensor, Tensor]: + # A must be square: assert A.shape[-2] == A.shape[-1], A.shape if B is not None: diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index 2a83aeca0de8..7caceff4a1d1 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -2342,6 +2342,18 @@ def callable(a, b) -> number Alias for :meth:`~Tensor.dim()` """) +add_docstr_all('nan_to_num', r""" +nan_to_num(nan=0.0, posinf=None, neginf=None) -> Tensor + +See :func:`torch.nan_to_num`. +""") + +add_docstr_all('nan_to_num_', r""" +nan_to_num_(nan=0.0, posinf=None, neginf=None) -> Tensor + +In-place version of :meth:`~Tensor.nan_to_num`. +""") + add_docstr_all('ne', r""" ne(other) -> Tensor @@ -3121,6 +3133,20 @@ def callable(a, b) -> number See :func:`torch.signbit` """) +add_docstr_all('sgn', + r""" +sgn() -> Tensor + +See :func:`torch.sgn` +""") + +add_docstr_all('sgn_', + r""" +sgn_() -> Tensor + +In-place version of :meth:`~Tensor.sgn` +""") + add_docstr_all('sin', r""" sin() -> Tensor diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index d0f6f8c92151..6c641c3df140 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -515,6 +515,12 @@ def merge_dicts(*dicts): For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha` must be real numbers, otherwise they should be integers +.. warning:: + This function is deprecated and may be removed in a future release. + It can be implemented using :func:`torch.outer` as + ``alpha * torch.outer(vec1, vec2) + beta * input`` when :attr:`beta` is not zero, + and as ``alpha * torch.outer(vec1, vec2)`` when :attr:`beta` is zero. + Args: input (Tensor): matrix to be added vec1 (Tensor): the first vector of the outer product @@ -2734,20 +2740,27 @@ def merge_dicts(*dicts): tensor([-1., 1., -1., -1.]) """.format(**common_args)) -add_docstr(torch.floor_divide, - r""" +add_docstr(torch.floor_divide, r""" floor_divide(input, other, *, out=None) -> Tensor -Return the division of the inputs rounded down to the nearest integer. See :func:`torch.div` -for type promotion and broadcasting rules. +.. warning:: + This function's name is a misnomer. It actually rounds the + quotient towards zero instead of taking its floor. This behavior + will be deprecated in a future PyTorch release. + +Computes :attr:`input` divided by :attr:`other`, elementwise, and rounds each +quotient towards zero. Equivalently, it truncates the quotient(s): .. math:: - \text{{out}}_i = \left\lfloor \frac{{\text{{input}}_i}}{{\text{{other}}_i}} \right\rfloor + \text{{out}}_i = \text{trunc} \left( \frac{{\text{{input}}_i}}{{\text{{other}}_i}} \right) """ + r""" + +Supports broadcasting to a common shape, type promotion, and integer and float inputs. 
+ Args: - input (Tensor): the numerator tensor - other (Tensor or Scalar): the denominator + input (Tensor or Number): the dividend + other (Tensor or Number): the divisor Keyword args: {out} @@ -2982,13 +2995,6 @@ def merge_dicts(*dicts): add_docstr(torch.outer, r""" outer(input, vec2, *, out=None) -> Tensor -Alias of :func:`torch.ger`. -""") - -add_docstr(torch.ger, - r""" -ger(input, vec2, *, out=None) -> Tensor - Outer product of :attr:`input` and :attr:`vec2`. If :attr:`input` is a vector of size :math:`n` and :attr:`vec2` is a vector of size :math:`m`, then :attr:`out` must be a matrix of size :math:`(n \times m)`. @@ -3006,13 +3012,24 @@ def merge_dicts(*dicts): >>> v1 = torch.arange(1., 5.) >>> v2 = torch.arange(1., 4.) - >>> torch.ger(v1, v2) + >>> torch.outer(v1, v2) tensor([[ 1., 2., 3.], [ 2., 4., 6.], [ 3., 6., 9.], [ 4., 8., 12.]]) """) +add_docstr(torch.ger, + r""" +ger(input, vec2, *, out=None) -> Tensor + +Alias of :func:`torch.outer`. + +.. warning:: + This function is deprecated and will be removed in a future PyTorch release. + Use :func:`torch.outer` instead. +""") + add_docstr(torch.solve, r""" torch.solve(input, A, *, out=None) -> (Tensor, Tensor) @@ -4947,8 +4964,14 @@ def merge_dicts(*dicts): 1 is appended to its dimension for the purpose of the batched matrix multiple and removed after. The non-matrix (i.e. batch) dimensions are :ref:`broadcasted ` (and thus must be broadcastable). For example, if :attr:`input` is a + :math:`(j \times 1 \times n \times n)` tensor and :attr:`other` is a :math:`(k \times n \times n)` + tensor, :attr:`out` will be a :math:`(j \times k \times n \times n)` tensor. + + Note that the broadcasting logic only looks at the batch dimensions when determining if the inputs + are broadcastable, and not the matrix dimensions. For example, if :attr:`input` is a :math:`(j \times 1 \times n \times m)` tensor and :attr:`other` is a :math:`(k \times m \times p)` - tensor, :attr:`out` will be an :math:`(j \times k \times n \times p)` tensor. + tensor, these inputs are valid for broadcasting even though the final two dimensions (i.e. the + matrix dimensions) are different. :attr:`out` will be a :math:`(j \times k \times n \times p)` tensor. {tf32_note} @@ -5278,6 +5301,41 @@ def merge_dicts(*dicts): [ 8, 9]]) """) +add_docstr(torch.nan_to_num, + r""" +nan_to_num(input, nan=0.0, posinf=None, neginf=None, *, out=None) -> Tensor + +Replaces :literal:`NaN`, positive infinity, and negative infinity values in :attr:`input` +with the values specified by :attr:`nan`, :attr:`posinf`, and :attr:`neginf`, respectively. +By default, :literal:`NaN`s are replaced with zero, positive infinity is replaced with the +greatest finite value representable by :attr:`input`'s dtype, and negative infinity +is replaced with the least finite value representable by :attr:`input`'s dtype. + +Args: + {input} + nan (Number, optional): the value to replace :literal:`NaN`\s with. Default is zero. + posinf (Number, optional): if a Number, the value to replace positive infinity values with. + If None, positive infinity values are replaced with the greatest finite value representable by :attr:`input`'s dtype. + Default is None. + neginf (Number, optional): if a Number, the value to replace negative infinity values with. + If None, negative infinity values are replaced with the lowest finite value representable by :attr:`input`'s dtype. + Default is None. 
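The truncation behaviour called out in the ``floor_divide`` warning only differs from a true floor for negative quotients; a minimal illustration::

    import torch

    a = torch.tensor([7.0, -7.0])
    b = torch.tensor([2.0, 2.0])

    print(torch.floor_divide(a, b))   # tensor([ 3., -3.])  -- rounds toward zero
    print(torch.floor(a / b))         # tensor([ 3., -4.])  -- an actual floor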
+ +Keyword args: + {out} + +Example:: + + >>> x = torch.tensor([float('nan'), float('inf'), -float('inf'), 3.14]) + >>> torch.nan_to_num(x) + tensor([ 0.0000e+00, 3.4028e+38, -3.4028e+38, 3.1400e+00]) + >>> torch.nan_to_num(x, nan=2.0) + tensor([ 2.0000e+00, 3.4028e+38, -3.4028e+38, 3.1400e+00]) + >>> torch.nan_to_num(x, nan=2.0, posinf=1.0) + tensor([ 2.0000e+00, 1.0000e+00, -3.4028e+38, 3.1400e+00]) + +""".format(**common_args)) + add_docstr(torch.ne, r""" ne(input, other, *, out=None) -> Tensor @@ -5648,7 +5706,7 @@ def merge_dicts(*dicts): add_docstr(torch.poisson, r""" -poisson(input *, generator=None) -> Tensor +poisson(input, generator=None) -> Tensor Returns a tensor of the same size as :attr:`input` with each element sampled from a Poisson distribution with rate parameter given by the corresponding @@ -5847,7 +5905,7 @@ def merge_dicts(*dicts): add_docstr(torch.qr, r""" -qr(input, some=True, out=None) -> (Tensor, Tensor) +qr(input, some=True, *, out=None) -> (Tensor, Tensor) Computes the QR decomposition of a matrix or a batch of matrices :attr:`input`, and returns a namedtuple (Q, R) of tensors such that :math:`\text{input} = Q R` @@ -5875,6 +5933,8 @@ def merge_dicts(*dicts): batch dimensions consisting of matrices of dimension :math:`m \times n`. some (bool, optional): Set to ``True`` for reduced QR decomposition and ``False`` for complete QR decomposition. + +Keyword args: out (tuple, optional): tuple of `Q` and `R` tensors satisfying :code:`input = torch.matmul(Q, R)`. The dimensions of `Q` and `R` are :math:`(*, m, k)` and :math:`(*, k, n)` @@ -5911,7 +5971,7 @@ def merge_dicts(*dicts): add_docstr(torch.rad2deg, r""" -rad2deg(input, out=None) -> Tensor +rad2deg(input, *, out=None) -> Tensor Returns a new tensor with each of the elements of :attr:`input` converted from angles in radians to degrees. @@ -5934,7 +5994,7 @@ def merge_dicts(*dicts): add_docstr(torch.deg2rad, r""" -deg2rad(input, out=None) -> Tensor +deg2rad(input, *, out=None) -> Tensor Returns a new tensor with each of the elements of :attr:`input` converted from angles in degrees to radians. @@ -5991,7 +6051,7 @@ def merge_dicts(*dicts): add_docstr(torch.rand, r""" -rand(*size, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor +rand(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor Returns a tensor filled with random numbers from a uniform distribution on the interval :math:`[0, 1)` @@ -6001,6 +6061,8 @@ def merge_dicts(*dicts): Args: size (int...): a sequence of integers defining the shape of the output tensor. Can be a variable number of arguments or a collection like a list or tuple. + +Keyword args: {out} {dtype} {layout} @@ -6018,7 +6080,7 @@ def merge_dicts(*dicts): add_docstr(torch.rand_like, r""" -rand_like(input, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor +rand_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor Returns a tensor with the same size as :attr:`input` that is filled with random numbers from a uniform distribution on the interval :math:`[0, 1)`. @@ -6027,6 +6089,8 @@ def merge_dicts(*dicts): Args: {input} + +Keyword args: {dtype} {layout} {device} @@ -6053,6 +6117,8 @@ def merge_dicts(*dicts): low (int, optional): Lowest integer to be drawn from the distribution. Default: 0. high (int): One above the highest integer to be drawn from the distribution. 
size (tuple): a tuple defining the shape of the output tensor. + +Keyword args: {generator} {out} {dtype} @@ -6080,7 +6146,7 @@ def merge_dicts(*dicts): add_docstr(torch.randint_like, """ -randint_like(input, low=0, high, dtype=None, layout=torch.strided, device=None, requires_grad=False, \ +randint_like(input, low=0, high, \\*, dtype=None, layout=torch.strided, device=None, requires_grad=False, \ memory_format=torch.preserve_format) -> Tensor Returns a tensor with the same shape as Tensor :attr:`input` filled with @@ -6095,6 +6161,8 @@ def merge_dicts(*dicts): {input} low (int, optional): Lowest integer to be drawn from the distribution. Default: 0. high (int): One above the highest integer to be drawn from the distribution. + +Keyword args: {dtype} {layout} {device} @@ -6105,7 +6173,7 @@ def merge_dicts(*dicts): add_docstr(torch.randn, r""" -randn(*size, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor +randn(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor Returns a tensor filled with random numbers from a normal distribution with mean `0` and variance `1` (also called the standard normal @@ -6119,6 +6187,8 @@ def merge_dicts(*dicts): Args: size (int...): a sequence of integers defining the shape of the output tensor. Can be a variable number of arguments or a collection like a list or tuple. + +Keyword args: {out} {dtype} {layout} @@ -6136,7 +6206,7 @@ def merge_dicts(*dicts): add_docstr(torch.randn_like, r""" -randn_like(input, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor +randn_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor Returns a tensor with the same size as :attr:`input` that is filled with random numbers from a normal distribution with mean 0 and variance 1. @@ -6145,6 +6215,8 @@ def merge_dicts(*dicts): Args: {input} + +Keyword args: {dtype} {layout} {device} @@ -6155,12 +6227,14 @@ def merge_dicts(*dicts): add_docstr(torch.randperm, r""" -randperm(n, out=None, dtype=torch.int64, layout=torch.strided, device=None, requires_grad=False) -> LongTensor +randperm(n, *, out=None, dtype=torch.int64, layout=torch.strided, device=None, requires_grad=False) -> LongTensor Returns a random permutation of integers from ``0`` to ``n - 1``. Args: n (int): the upper bound (exclusive) + +Keyword args: {out} dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. Default: ``torch.int64``. @@ -6176,7 +6250,7 @@ def merge_dicts(*dicts): add_docstr(torch.tensor, r""" -tensor(data, dtype=None, device=None, requires_grad=False, pin_memory=False) -> Tensor +tensor(data, *, dtype=None, device=None, requires_grad=False, pin_memory=False) -> Tensor Constructs a tensor with :attr:`data`. @@ -6197,6 +6271,8 @@ def merge_dicts(*dicts): Args: {data} + +Keyword args: {dtype} {device} {requires_grad} @@ -6227,7 +6303,7 @@ def merge_dicts(*dicts): add_docstr(torch.range, r""" -range(start=0, end, step=1, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor +range(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor Returns a 1-D tensor of size :math:`\left\lfloor \frac{\text{end} - \text{start}}{\text{step}} \right\rfloor + 1` with values from :attr:`start` to :attr:`end` with step :attr:`step`. 
Step is @@ -6244,6 +6320,8 @@ def merge_dicts(*dicts): start (float): the starting value for the set of points. Default: ``0``. end (float): the ending value for the set of points step (float): the gap between each pair of adjacent points. Default: ``1``. + +Keyword args: {out} {dtype} If `dtype` is not given, infer the data type from the other input arguments. If any of `start`, `end`, or `stop` are floating-point, the @@ -6264,7 +6342,7 @@ def merge_dicts(*dicts): add_docstr(torch.arange, r""" -arange(start=0, end, step=1, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor +arange(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor Returns a 1-D tensor of size :math:`\left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil` with values from the interval ``[start, end)`` taken with common difference @@ -6281,6 +6359,8 @@ def merge_dicts(*dicts): start (Number): the starting value for the set of points. Default: ``0``. end (Number): the ending value for the set of points step (Number): the gap between each pair of adjacent points. Default: ``1``. + +Keyword args: {out} {dtype} If `dtype` is not given, infer the data type from the other input arguments. If any of `start`, `end`, or `stop` are floating-point, the @@ -6303,7 +6383,7 @@ def merge_dicts(*dicts): add_docstr(torch.remainder, r""" -remainder(input, other, out=None) -> Tensor +remainder(input, other, *, out=None) -> Tensor Computes the element-wise remainder of division. @@ -6317,6 +6397,8 @@ def merge_dicts(*dicts): input (Tensor): the dividend other (Tensor or float): the divisor that may be either a number or a Tensor of the same shape as the dividend + +Keyword args: {out} Example:: @@ -6334,7 +6416,7 @@ def merge_dicts(*dicts): add_docstr(torch.renorm, r""" -renorm(input, p, dim, maxnorm, out=None) -> Tensor +renorm(input, p, dim, maxnorm, *, out=None) -> Tensor Returns a tensor where each sub-tensor of :attr:`input` along dimension :attr:`dim` is normalized such that the `p`-norm of the sub-tensor is lower @@ -6347,6 +6429,8 @@ def merge_dicts(*dicts): p (float): the power for the norm computation dim (int): the dimension to slice over to get the sub-tensors maxnorm (float): the maximum norm to keep each sub-tensor under + +Keyword args: {out} Example:: @@ -6420,13 +6504,15 @@ def merge_dicts(*dicts): add_docstr(torch.round, r""" -round(input, out=None) -> Tensor +round(input, *, out=None) -> Tensor Returns a new tensor with each of the elements of :attr:`input` rounded to the closest integer. Args: {input} + +Keyword args: {out} Example:: @@ -6440,7 +6526,7 @@ def merge_dicts(*dicts): add_docstr(torch.rsqrt, r""" -rsqrt(input, out=None) -> Tensor +rsqrt(input, *, out=None) -> Tensor Returns a new tensor with the reciprocal of the square-root of each of the elements of :attr:`input`. @@ -6450,6 +6536,8 @@ def merge_dicts(*dicts): """ + r""" Args: {input} + +Keyword args: {out} Example:: @@ -6533,7 +6621,7 @@ def merge_dicts(*dicts): add_docstr(torch.logit, r""" -logit(input, eps=None, out=None) -> Tensor +logit(input, eps=None, *, out=None) -> Tensor Returns a new tensor with the logit of the elements of :attr:`input`. :attr:`input` is clamped to [eps, 1 - eps] when eps is not None. @@ -6551,6 +6639,8 @@ def merge_dicts(*dicts): Args: {input} eps (float, optional): the epsilon for input clamp bound. 
Default: ``None`` + +Keyword args: {out} Example:: @@ -6564,7 +6654,7 @@ def merge_dicts(*dicts): add_docstr(torch.sign, r""" -sign(input, out=None) -> Tensor +sign(input, *, out=None) -> Tensor Returns a new tensor with the signs of the elements of :attr:`input`. @@ -6573,6 +6663,8 @@ def merge_dicts(*dicts): """ + r""" Args: {input} + +Keyword args: {out} Example:: @@ -6603,9 +6695,34 @@ def merge_dicts(*dicts): tensor([ False, True, False, False]) """.format(**common_args)) +add_docstr(torch.sgn, + r""" +sgn(input, *, out=None) -> Tensor + +For complex tensors, this function returns a new tensor whose elements have the same angle as that of the +elements of :attr:`input` and absolute value 1. For a non-complex tensor, this function +returns the signs of the elements of :attr:`input` (see :func:`torch.sign`). + +:math:`\text{out}_{i} = 0`, if :math:`|{\text{{input}}_i}| == 0` +:math:`\text{out}_{i} = \frac{{\text{{input}}_i}}{|{\text{{input}}_i}|}`, otherwise + +""" + r""" +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> x=torch.tensor([3+4j, 7-24j, 0, 1+2j]) + >>> x.sgn() + tensor([0.6000+0.8000j, 0.2800-0.9600j, 0.0000+0.0000j, 0.4472+0.8944j]) +""".format(**common_args)) + add_docstr(torch.sin, r""" -sin(input, out=None) -> Tensor +sin(input, *, out=None) -> Tensor Returns a new tensor with the sine of the elements of :attr:`input`. @@ -6614,6 +6731,8 @@ def merge_dicts(*dicts): """ + r""" Args: {input} + +Keyword args: {out} Example:: @@ -6627,7 +6746,7 @@ def merge_dicts(*dicts): add_docstr(torch.sinh, r""" -sinh(input, out=None) -> Tensor +sinh(input, *, out=None) -> Tensor Returns a new tensor with the hyperbolic sine of the elements of :attr:`input`. @@ -6637,6 +6756,8 @@ def merge_dicts(*dicts): """ + r""" Args: {input} + +Keyword args: {out} Example:: @@ -6650,7 +6771,7 @@ def merge_dicts(*dicts): add_docstr(torch.sort, r""" -sort(input, dim=-1, descending=False, out=None) -> (Tensor, LongTensor) +sort(input, dim=-1, descending=False, *, out=None) -> (Tensor, LongTensor) Sorts the elements of the :attr:`input` tensor along a given dimension in ascending order by value. @@ -6668,6 +6789,8 @@ def merge_dicts(*dicts): {input} dim (int, optional): the dimension to sort along descending (bool, optional): controls the sorting order (ascending or descending) + +Keyword args: out (tuple, optional): the output tuple of (`Tensor`, `LongTensor`) that can be optionally given to be used as output buffers @@ -6729,7 +6852,7 @@ def merge_dicts(*dicts): add_docstr(torch.sparse_coo_tensor, r""" -sparse_coo_tensor(indices, values, size=None, dtype=None, device=None, requires_grad=False) -> Tensor +sparse_coo_tensor(indices, values, size=None, *, dtype=None, device=None, requires_grad=False) -> Tensor Constructs a sparse tensors in COO(rdinate) format with non-zero elements at the given :attr:`indices` with the given :attr:`values`. A sparse tensor can be `uncoalesced`, in that case, there are duplicate @@ -6747,6 +6870,8 @@ def merge_dicts(*dicts): size (list, tuple, or :class:`torch.Size`, optional): Size of the sparse tensor. If not provided the size will be inferred as the minimum size big enough to hold all non-zero elements. + +Keyword args: dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. Default: if None, infers data type from :attr:`values`. device (:class:`torch.device`, optional): the desired device of returned tensor.
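The ``sgn`` definition above can be reproduced by hand for complex inputs; a short hedged check against the documented example values (this assumes a build where ``torch.sgn`` is available, as added elsewhere in this patch)::

    import torch

    x = torch.tensor([3 + 4j, 7 - 24j, 0j, 1 + 2j])
    mag = x.abs()
    safe_mag = torch.where(mag == 0, torch.ones_like(mag), mag)  # avoid 0/0; zeros stay zero
    manual = x / safe_mag                                        # input / |input|
    assert (manual - torch.sgn(x)).abs().max() < 1e-6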
@@ -6806,7 +6931,7 @@ def merge_dicts(*dicts): add_docstr(torch.sqrt, r""" -sqrt(input, out=None) -> Tensor +sqrt(input, *, out=None) -> Tensor Returns a new tensor with the square-root of the elements of :attr:`input`. @@ -6815,6 +6940,8 @@ def merge_dicts(*dicts): """ + r""" Args: {input} + +Keyword args: {out} Example:: @@ -6828,12 +6955,14 @@ def merge_dicts(*dicts): add_docstr(torch.square, r""" -square(input, out=None) -> Tensor +square(input, *, out=None) -> Tensor Returns a new tensor with the square of the elements of :attr:`input`. Args: {input} + +Keyword args: {out} Example:: @@ -6847,7 +6976,7 @@ def merge_dicts(*dicts): add_docstr(torch.squeeze, r""" -squeeze(input, dim=None, out=None) -> Tensor +squeeze(input, dim=None, *, out=None) -> Tensor Returns a tensor with all the dimensions of :attr:`input` of size `1` removed. @@ -6871,6 +7000,8 @@ def merge_dicts(*dicts): {input} dim (int, optional): if given, the input will be squeezed only in this dimension + +Keyword args: {out} Example:: @@ -7027,12 +7158,14 @@ def merge_dicts(*dicts): add_docstr(torch.sum, r""" -sum(input, dtype=None) -> Tensor +sum(input, *, dtype=None) -> Tensor Returns the sum of all elements in the :attr:`input` tensor. Args: {input} + +Keyword args: {dtype} Example:: @@ -7043,7 +7176,7 @@ def merge_dicts(*dicts): >>> torch.sum(a) tensor(-0.5475) -.. function:: sum(input, dim, keepdim=False, dtype=None) -> Tensor +.. function:: sum(input, dim, keepdim=False, *, dtype=None) -> Tensor Returns the sum of each row of the :attr:`input` tensor in the given dimension :attr:`dim`. If :attr:`dim` is a list of dimensions, @@ -7055,6 +7188,8 @@ def merge_dicts(*dicts): {input} {dim} {keepdim} + +Keyword args: {dtype} Example:: @@ -7074,7 +7209,7 @@ def merge_dicts(*dicts): add_docstr(torch.nansum, r""" -nansum(input, dtype=None) -> Tensor +nansum(input, *, dtype=None) -> Tensor Returns the sum of all elements, treating Not a Numbers (NaNs) as zero. @@ -7090,7 +7225,7 @@ def merge_dicts(*dicts): >>> torch.nansum(a) tensor(7.) -.. function:: nansum(input, dim, keepdim=False, dtype=None) -> Tensor +.. function:: nansum(input, dim, keepdim=False, *, dtype=None) -> Tensor Returns the sum of each row of the :attr:`input` tensor in the given dimension :attr:`dim`, treating Not a Numbers (NaNs) as zero. @@ -7121,7 +7256,7 @@ def merge_dicts(*dicts): add_docstr(torch.svd, r""" -svd(input, some=True, compute_uv=True, out=None) -> (Tensor, Tensor, Tensor) +svd(input, some=True, compute_uv=True, *, out=None) -> (Tensor, Tensor, Tensor) This function returns a namedtuple ``(U, S, V)`` which is the singular value decomposition of a input real matrix or batches of real matrices :attr:`input` such that @@ -7163,6 +7298,8 @@ def merge_dicts(*dicts): batch dimensions consisting of :math:`m \times n` matrices. some (bool, optional): controls the shape of returned `U` and `V` compute_uv (bool, optional): option whether to compute `U` and `V` or not + +Keyword args: out (tuple, optional): the output tuple of tensors Example:: @@ -7197,7 +7334,7 @@ def merge_dicts(*dicts): add_docstr(torch.symeig, r""" -symeig(input, eigenvectors=False, upper=True, out=None) -> (Tensor, Tensor) +symeig(input, eigenvectors=False, upper=True, *, out=None) -> (Tensor, Tensor) This function returns eigenvalues and eigenvectors of a real symmetric matrix :attr:`input` or a batch of real symmetric matrices, @@ -7232,6 +7369,8 @@ def merge_dicts(*dicts): batch dimensions consisting of symmetric matrices. 
eigenvectors(boolean, optional): controls whether eigenvectors have to be computed upper(boolean, optional): controls whether to consider upper-triangular or lower-triangular region + +Keyword args: out (tuple, optional): the output tuple of (Tensor, Tensor) Returns: @@ -7484,7 +7623,7 @@ def merge_dicts(*dicts): add_docstr(torch.tan, r""" -tan(input, out=None) -> Tensor +tan(input, *, out=None) -> Tensor Returns a new tensor with the tangent of the elements of :attr:`input`. @@ -7493,6 +7632,8 @@ def merge_dicts(*dicts): """ + r""" Args: {input} + +Keyword args: {out} Example:: @@ -7506,7 +7647,7 @@ def merge_dicts(*dicts): add_docstr(torch.tanh, r""" -tanh(input, out=None) -> Tensor +tanh(input, *, out=None) -> Tensor Returns a new tensor with the hyperbolic tangent of the elements of :attr:`input`. @@ -7516,6 +7657,8 @@ def merge_dicts(*dicts): """ + r""" Args: {input} + +Keyword args: {out} Example:: @@ -7529,7 +7672,7 @@ def merge_dicts(*dicts): add_docstr(torch.topk, r""" -topk(input, k, dim=None, largest=True, sorted=True, out=None) -> (Tensor, LongTensor) +topk(input, k, dim=None, largest=True, sorted=True, *, out=None) -> (Tensor, LongTensor) Returns the :attr:`k` largest elements of the given :attr:`input` tensor along a given dimension. @@ -7552,6 +7695,8 @@ def merge_dicts(*dicts): smallest elements sorted (bool, optional): controls whether to return the elements in sorted order + +Keyword args: out (tuple, optional): the output tuple of (Tensor, LongTensor) that can be optionally given to be used as output buffers @@ -7661,7 +7806,7 @@ def merge_dicts(*dicts): add_docstr(torch.tril, r""" -tril(input, diagonal=0, out=None) -> Tensor +tril(input, diagonal=0, *, out=None) -> Tensor Returns the lower triangular part of the matrix (2-D tensor) or batch of matrices :attr:`input`, the other elements of the result tensor :attr:`out` are set to 0. @@ -7680,6 +7825,8 @@ def merge_dicts(*dicts): Args: {input} diagonal (int, optional): the diagonal to consider + +Keyword args: {out} Example:: @@ -7716,7 +7863,7 @@ def merge_dicts(*dicts): # as common args. add_docstr(torch.tril_indices, r""" -tril_indices(row, col, offset=0, dtype=torch.long, device='cpu', layout=torch.strided) -> Tensor +tril_indices(row, col, offset=0, *, dtype=torch.long, device='cpu', layout=torch.strided) -> Tensor Returns the indices of the lower triangular part of a :attr:`row`-by- :attr:`col` matrix in a 2-by-N Tensor, where the first row contains row @@ -7743,6 +7890,8 @@ def merge_dicts(*dicts): col (``int``): number of columns in the 2-D matrix. offset (``int``): diagonal offset from the main diagonal. Default: if not provided, 0. + +Keyword args: dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. Default: if ``None``, ``torch.long``. {device} @@ -7767,7 +7916,7 @@ def merge_dicts(*dicts): add_docstr(torch.triu, r""" -triu(input, diagonal=0, out=None) -> Tensor +triu(input, diagonal=0, *, out=None) -> Tensor Returns the upper triangular part of a matrix (2-D tensor) or batch of matrices :attr:`input`, the other elements of the result tensor :attr:`out` are set to 0. @@ -7786,6 +7935,8 @@ def merge_dicts(*dicts): Args: {input} diagonal (int, optional): the diagonal to consider + +Keyword args: {out} Example:: @@ -7830,7 +7981,7 @@ def merge_dicts(*dicts): # as common args. 
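Many of the signature tweaks in this stretch only mark ``out`` (and similar arguments) as keyword-only in the rendered docs; a short usage sketch passing a preallocated output pair to ``topk`` by keyword (shapes arbitrary)::

    import torch

    x = torch.randn(10)
    values = torch.empty(3)
    indices = torch.empty(3, dtype=torch.long)

    torch.topk(x, 3, out=(values, indices))   # `out` given by keyword, per the updated docs
    print(values, indices)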
add_docstr(torch.triu_indices, r""" -triu_indices(row, col, offset=0, dtype=torch.long, device='cpu', layout=torch.strided) -> Tensor +triu_indices(row, col, offset=0, *, dtype=torch.long, device='cpu', layout=torch.strided) -> Tensor Returns the indices of the upper triangular part of a :attr:`row` by :attr:`col` matrix in a 2-by-N Tensor, where the first row contains row @@ -7857,6 +8008,8 @@ def merge_dicts(*dicts): col (``int``): number of columns in the 2-D matrix. offset (``int``): diagonal offset from the main diagonal. Default: if not provided, 0. + +Keyword args: dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. Default: if ``None``, ``torch.long``. {device} @@ -7887,13 +8040,15 @@ def merge_dicts(*dicts): add_docstr(torch.trunc, r""" -trunc(input, out=None) -> Tensor +trunc(input, *, out=None) -> Tensor Returns a new tensor with the truncated integer values of the elements of :attr:`input`. Args: {input} + +Keyword args: {out} Example:: @@ -8044,7 +8199,7 @@ def merge_dicts(*dicts): add_docstr(torch.zeros, r""" -zeros(*size, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor +zeros(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor Returns a tensor filled with the scalar value `0`, with the shape defined by the variable argument :attr:`size`. @@ -8052,6 +8207,8 @@ def merge_dicts(*dicts): Args: size (int...): a sequence of integers defining the shape of the output tensor. Can be a variable number of arguments or a collection like a list or tuple. + +Keyword args: {out} {dtype} {layout} @@ -8070,7 +8227,7 @@ def merge_dicts(*dicts): add_docstr(torch.zeros_like, r""" -zeros_like(input, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor +zeros_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor Returns a tensor filled with the scalar value `0`, with the same size as :attr:`input`. ``torch.zeros_like(input)`` is equivalent to @@ -8083,6 +8240,8 @@ def merge_dicts(*dicts): Args: {input} + +Keyword args: {dtype} {layout} {device} @@ -8099,7 +8258,7 @@ def merge_dicts(*dicts): add_docstr(torch.empty, r""" -empty(*size, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor +empty(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor Returns a tensor filled with uninitialized data. The shape of the tensor is defined by the variable argument :attr:`size`. @@ -8107,6 +8266,8 @@ def merge_dicts(*dicts): Args: size (int...): a sequence of integers defining the shape of the output tensor. Can be a variable number of arguments or a collection like a list or tuple. + +Keyword args: {out} {dtype} {layout} @@ -8126,7 +8287,7 @@ def merge_dicts(*dicts): add_docstr(torch.empty_like, r""" -empty_like(input, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor +empty_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor Returns an uninitialized tensor with the same size as :attr:`input`. 
``torch.empty_like(input)`` is equivalent to @@ -8134,6 +8295,8 @@ def merge_dicts(*dicts): Args: {input} + +Keyword args: {dtype} {layout} {device} @@ -8149,7 +8312,7 @@ def merge_dicts(*dicts): add_docstr(torch.empty_strided, r""" -empty_strided(size, stride, dtype=None, layout=None, device=None, requires_grad=False, pin_memory=False) -> Tensor +empty_strided(size, stride, *, dtype=None, layout=None, device=None, requires_grad=False, pin_memory=False) -> Tensor Returns a tensor filled with uninitialized data. The shape and strides of the tensor is defined by the variable argument :attr:`size` and :attr:`stride` respectively. @@ -8165,6 +8328,8 @@ def merge_dicts(*dicts): Args: size (tuple of ints): the shape of the output tensor stride (tuple of ints): the strides of the output tensor + +Keyword args: {dtype} {layout} {device} @@ -8193,6 +8358,8 @@ def merge_dicts(*dicts): size (int...): a list, tuple, or :class:`torch.Size` of integers defining the shape of the output tensor. fill_value (Scalar): the value to fill the output tensor with. + +Keyword args: {out} {dtype} {layout} @@ -8208,7 +8375,7 @@ def merge_dicts(*dicts): add_docstr(torch.full_like, """ -full_like(input, fill_value, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, \ +full_like(input, fill_value, \\*, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, \ memory_format=torch.preserve_format) -> Tensor Returns a tensor with the same size as :attr:`input` filled with :attr:`fill_value`. @@ -8218,6 +8385,8 @@ def merge_dicts(*dicts): Args: {input} fill_value: the number to fill the output tensor with. + +Keyword args: {dtype} {layout} {device} @@ -8463,7 +8632,13 @@ def merge_dicts(*dicts): add_docstr(torch.fft, r""" fft(input, signal_ndim, normalized=False) -> Tensor -Complex-to-complex Discrete Fourier Transform +Complex-to-complex Discrete Fourier Transform. + +.. warning:: + The function :func:`torch.fft` is deprecated and will be removed in + PyTorch 1.8. Use the new :ref:`torch.fft ` module + functions, instead, by importing :ref:`torch.fft ` and + calling :func:`torch.fft.fft` or :func:`torch.fft.fftn`. This method computes the complex-to-complex discrete Fourier transform. Ignoring the batch dimensions, it computes the following expression: @@ -8563,11 +8738,16 @@ def merge_dicts(*dicts): """) -add_docstr(torch.ifft, - r""" +add_docstr(torch.ifft, r""" ifft(input, signal_ndim, normalized=False) -> Tensor -Complex-to-complex Inverse Discrete Fourier Transform +Complex-to-complex Inverse Discrete Fourier Transform. + +.. warning:: + The function :func:`torch.ifft` is deprecated and will be removed in a + future PyTorch release. Use the new :ref:`torch.fft ` + module functions, instead, by importing :ref:`torch.fft ` + and calling :func:`torch.fft.ifft` or :func:`torch.fft.ifftn`. This method computes the complex-to-complex inverse discrete Fourier transform. Ignoring the batch dimensions, it computes the following @@ -8648,11 +8828,17 @@ def merge_dicts(*dicts): """) -add_docstr(torch.rfft, - r""" +add_docstr(torch.rfft, r""" rfft(input, signal_ndim, normalized=False, onesided=True) -> Tensor -Real-to-complex Discrete Fourier Transform +Real-to-complex Discrete Fourier Transform. + +.. warning:: + The function :func:`torch.rfft` is deprecated and will be removed in a + future PyTorch release. 
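The new warning above deprecates the function form `torch.fft(input, signal_ndim)` in favour of the `torch.fft` module. A minimal sketch of the usage the warning points to (per its wording, the module is imported explicitly and its functions are called directly):

```python
import torch
import torch.fft  # import the module and call its functions, as the warning suggests

t = torch.randn(16, 16, dtype=torch.complex64)
y1 = torch.fft.fft(t[0])   # 1-D complex-to-complex transform
yn = torch.fft.fftn(t)     # N-D transform, replacing torch.fft(input, signal_ndim=2)
```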
Use the new :ref:`torch.fft ` + module functions, instead, by importing :ref:`torch.fft ` + and calling :func:`torch.fft.rfft` for one-sided output, or + :func:`torch.fft.fft` for two-sided output. This method computes the real-to-complex discrete Fourier transform. It is mathematically equivalent with :func:`~torch.fft` with differences only in @@ -8717,11 +8903,17 @@ def merge_dicts(*dicts): """) -add_docstr(torch.irfft, - r""" +add_docstr(torch.irfft, r""" irfft(input, signal_ndim, normalized=False, onesided=True, signal_sizes=None) -> Tensor -Complex-to-real Inverse Discrete Fourier Transform +Complex-to-real Inverse Discrete Fourier Transform. + +.. warning:: + The function :func:`torch.irfft` is deprecated and will be removed in a + future PyTorch release. Use the new :ref:`torch.fft ` + module functions, instead, by importing :ref:`torch.fft ` + and calling :func:`torch.fft.irfft` for one-sided input, or + :func:`torch.fft.ifft` for two-sided input. This method computes the complex-to-real inverse discrete Fourier transform. It is mathematically equivalent with :func:`ifft` with differences only in @@ -8820,7 +9012,7 @@ def merge_dicts(*dicts): add_docstr(torch.hann_window, """ -hann_window(window_length, periodic=True, dtype=None, \ +hann_window(window_length, periodic=True, *, dtype=None, \ layout=torch.strided, device=None, requires_grad=False) -> Tensor """ + r""" Hann window function. @@ -8847,6 +9039,8 @@ def merge_dicts(*dicts): window_length (int): the size of returned window periodic (bool, optional): If True, returns a window to be used as periodic function. If False, return a symmetric window. + +Keyword args: {dtype} Only floating point types are supported. layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only ``torch.strided`` (dense layout) is supported. @@ -8861,7 +9055,7 @@ def merge_dicts(*dicts): add_docstr(torch.hamming_window, """ -hamming_window(window_length, periodic=True, alpha=0.54, beta=0.46, dtype=None, \ +hamming_window(window_length, periodic=True, alpha=0.54, beta=0.46, *, dtype=None, \ layout=torch.strided, device=None, requires_grad=False) -> Tensor """ + r""" Hamming window function. @@ -8892,6 +9086,8 @@ def merge_dicts(*dicts): function. If False, return a symmetric window. alpha (float, optional): The coefficient :math:`\alpha` in the equation above beta (float, optional): The coefficient :math:`\beta` in the equation above + +Keyword args: {dtype} Only floating point types are supported. layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only ``torch.strided`` (dense layout) is supported. @@ -8906,7 +9102,7 @@ def merge_dicts(*dicts): add_docstr(torch.bartlett_window, """ -bartlett_window(window_length, periodic=True, dtype=None, \ +bartlett_window(window_length, periodic=True, *, dtype=None, \ layout=torch.strided, device=None, requires_grad=False) -> Tensor """ + r""" Bartlett window function. @@ -8935,6 +9131,8 @@ def merge_dicts(*dicts): window_length (int): the size of returned window periodic (bool, optional): If True, returns a window to be used as periodic function. If False, return a symmetric window. + +Keyword args: {dtype} Only floating point types are supported. layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only ``torch.strided`` (dense layout) is supported. 
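Same migration pattern for the real-input transforms deprecated above: `torch.rfft`/`torch.irfft` point users at `torch.fft.rfft`/`torch.fft.irfft` for one-sided data, or `torch.fft.fft`/`torch.fft.ifft` for two-sided data. The window functions below also gain keyword-only `dtype`/`layout`/`device`. A short sketch:

```python
import torch
import torch.fft

x = torch.randn(400)                                            # real signal
w = torch.hann_window(400, periodic=True, dtype=torch.float64)  # dtype is keyword-only now

X = torch.fft.rfft(x)                     # one-sided spectrum (replaces torch.rfft(x, 1))
x_back = torch.fft.irfft(X, n=x.numel())  # back to the length-400 real signal (replaces torch.irfft)
```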
@@ -8949,7 +9147,7 @@ def merge_dicts(*dicts): add_docstr(torch.blackman_window, """ -blackman_window(window_length, periodic=True, dtype=None, \ +blackman_window(window_length, periodic=True, *, dtype=None, \ layout=torch.strided, device=None, requires_grad=False) -> Tensor """ + r""" Blackman window function. @@ -8975,6 +9173,8 @@ def merge_dicts(*dicts): window_length (int): the size of returned window periodic (bool, optional): If True, returns a window to be used as periodic function. If False, return a symmetric window. + +Keyword args: {dtype} Only floating point types are supported. layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only ``torch.strided`` (dense layout) is supported. @@ -9001,7 +9201,7 @@ def merge_dicts(*dicts): out_i = I_0 \left( \beta \sqrt{1 - \left( {\frac{i - N/2}{N/2}} \right) ^2 } \right) / I_0( \beta ) Calling ``torch.kaiser_window(L, B, periodic=True)`` is equivalent to calling -``torch.kaiser_window(L + 1, B, periodic=False)[:-1])``. +``torch.kaiser_window(L + 1, B, periodic=False)[:-1])``. The :attr:`periodic` argument is intended as a helpful shorthand to produce a periodic window as input to functions like :func:`torch.stft`. @@ -9404,7 +9604,7 @@ def merge_dicts(*dicts): add_docstr(torch.searchsorted, r""" -searchsorted(sorted_sequence, values, out_int32=False, right=False, out=None) -> Tensor +searchsorted(sorted_sequence, values, *, out_int32=False, right=False, out=None) -> Tensor Find the indices from the *innermost* dimension of :attr:`sorted_sequence` such that, if the corresponding values in :attr:`values` were inserted before the indices, the order of the @@ -9422,21 +9622,23 @@ def merge_dicts(*dicts): - *returned index satisfies* * - 1-D - False - - ``sorted_sequence[i-1] <= values[m][n]...[l][x] < sorted_sequence[i]`` + - ``sorted_sequence[i-1] < values[m][n]...[l][x] <= sorted_sequence[i]`` * - 1-D - True - - ``sorted_sequence[i-1] < values[m][n]...[l][x] <= sorted_sequence[i]`` + - ``sorted_sequence[i-1] <= values[m][n]...[l][x] < sorted_sequence[i]`` * - N-D - False - - ``sorted_sequence[m][n]...[l][i-1] <= values[m][n]...[l][x] < sorted_sequence[m][n]...[l][i]`` + - ``sorted_sequence[m][n]...[l][i-1] < values[m][n]...[l][x] <= sorted_sequence[m][n]...[l][i]`` * - N-D - True - - ``sorted_sequence[m][n]...[l][i-1] < values[m][n]...[l][x] <= sorted_sequence[m][n]...[l][i]`` + - ``sorted_sequence[m][n]...[l][i-1] <= values[m][n]...[l][x] < sorted_sequence[m][n]...[l][i]`` Args: sorted_sequence (Tensor): N-D or 1-D tensor, containing monotonically increasing sequence on the *innermost* dimension. values (Tensor or Scalar): N-D tensor or a Scalar containing the search value(s). + +Keyword args: out_int32 (bool, optional): indicate the output data type. torch.int32 if True, torch.int64 otherwise. Default value is False, i.e. default output data type is torch.int64. right (bool, optional): if False, return the first suitable location that is found. If True, return the @@ -9479,7 +9681,7 @@ def merge_dicts(*dicts): add_docstr(torch.bucketize, r""" -bucketize(input, boundaries, out_int32=False, right=False, out=None) -> Tensor +bucketize(input, boundaries, *, out_int32=False, right=False, out=None) -> Tensor Returns the indices of the buckets to which each value in the :attr:`input` belongs, where the boundaries of the buckets are set by :attr:`boundaries`. 
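The corrected `searchsorted` table above now reads: `right=False` returns the first index `i` with `sorted_sequence[i-1] < v <= sorted_sequence[i]`, and `right=True` the half-open variant. A quick sketch with expected outputs worked out by hand from the corrected conditions:

```python
import torch

seq = torch.tensor([1, 3, 5, 7, 9])
vals = torch.tensor([3, 6, 9])

torch.searchsorted(seq, vals)               # right=False -> tensor([1, 3, 4])
torch.searchsorted(seq, vals, right=True)   # right=True  -> tensor([2, 3, 5])
```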
Return a new tensor with the same size @@ -9493,13 +9695,15 @@ def merge_dicts(*dicts): * - :attr:`right` - *returned index satisfies* * - False - - ``boundaries[i-1] <= input[m][n]...[l][x] < boundaries[i]`` - * - True - ``boundaries[i-1] < input[m][n]...[l][x] <= boundaries[i]`` + * - True + - ``boundaries[i-1] <= input[m][n]...[l][x] < boundaries[i]`` Args: input (Tensor or Scalar): N-D tensor or a Scalar containing the search value(s). boundaries (Tensor): 1-D tensor, must contain a monotonically increasing sequence. + +Keyword args: out_int32 (bool, optional): indicate the output data type. torch.int32 if True, torch.int64 otherwise. Default value is False, i.e. default output data type is torch.int64. right (bool, optional): if False, return the first suitable location that is found. If True, return the diff --git a/torch/autograd/functional.py b/torch/autograd/functional.py index 2a1d0ef55fd9..58e780c87d1b 100644 --- a/torch/autograd/functional.py +++ b/torch/autograd/functional.py @@ -381,15 +381,15 @@ def jacobian(func, inputs, create_graph=False, strict=False): Defaults to ``False``. Returns: - Jacobian (Tensor or nested tuple of Tensors): if there are a single - input and output, this will be a single Tensor containing the - Jacobian for the linearized inputs and output. If one of the two is - a tuple, then the Jacobian will be a tuple of Tensors. If both of - them are tuples, then the Jacobian will be a tuple of tuple of - Tensors where ``Jacobian[i][j]`` will contain the Jacobian of the - ``i``\th output and ``j``\th input and will have as size the - concatenation of the sizes of the corresponding output and the - corresponding input. + Jacobian (Tensor or nested tuple of Tensors): if there is a single + input and output, this will be a single Tensor containing the + Jacobian for the linearized inputs and output. If one of the two is + a tuple, then the Jacobian will be a tuple of Tensors. If both of + them are tuples, then the Jacobian will be a tuple of tuple of + Tensors where ``Jacobian[i][j]`` will contain the Jacobian of the + ``i``\th output and ``j``\th input and will have as size the + concatenation of the sizes of the corresponding output and the + corresponding input. Example: @@ -476,12 +476,12 @@ def hessian(func, inputs, create_graph=False, strict=False): Defaults to ``False``. Returns: - Hessian (Tensor or a tuple of tuple of Tensors) if there are a single input, - this will be a single Tensor containing the Hessian for the input. - If it is a tuple, then the Hessian will be a tuple of tuples where - ``Hessian[i][j]`` will contain the Hessian of the ``i``\th input - and ``j``\th input with size the sum of the size of the ``i``\th input plus - the size of the ``j``\th input. + Hessian (Tensor or a tuple of tuple of Tensors): if there is a single input, + this will be a single Tensor containing the Hessian for the input. + If it is a tuple, then the Hessian will be a tuple of tuples where + ``Hessian[i][j]`` will contain the Hessian of the ``i``\th input + and ``j``\th input with size the sum of the size of the ``i``\th input plus + the size of the ``j``\th input. Example: @@ -660,7 +660,9 @@ def hvp(func, inputs, v=None, create_graph=False, strict=False): hvp for said inputs, which is the expected mathematical value. Defaults to ``False``. 
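The `grad_mode` change above makes the decorator form build a fresh context manager per call (`self.__class__()`) instead of re-entering the single shared instance, so decorated functions and generators behave correctly when used re-entrantly. A minimal sketch of the decorator usage the change targets:

```python
import torch

x = torch.ones(1, requires_grad=True)

@torch.no_grad()              # each call now gets its own no_grad instance
def infer(t):
    return t * 2

print(infer(x).requires_grad)  # False: computed with grad disabled
print((x * 2).requires_grad)   # True: unaffected outside the decorated call
```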
Returns: - func_output (tuple of Tensors or Tensor): output of ``func(inputs)`` + output (tuple): tuple with: + func_output (tuple of Tensors or Tensor): output of ``func(inputs)`` + hvp (tuple of Tensors or Tensor): result of the dot product with the same shape as the inputs. diff --git a/torch/autograd/grad_mode.py b/torch/autograd/grad_mode.py index 4bcc3be1d85b..bbd96e941a54 100644 --- a/torch/autograd/grad_mode.py +++ b/torch/autograd/grad_mode.py @@ -22,7 +22,7 @@ def __call__(self, func: F) -> F: @functools.wraps(func) def decorate_context(*args, **kwargs): - with self: + with self.__class__(): return func(*args, **kwargs) return cast(F, decorate_context) @@ -33,7 +33,7 @@ def generator_context(*args, **kwargs): gen = func(*args, **kwargs) while True: try: - with self: + with self.__class__(): x = next(gen) yield x except StopIteration: diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 94b1aae844f1..8d33be090b27 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -105,6 +105,35 @@ def populate_cpu_children(self): self._cpu_children_populated = True + def set_backward_stacktraces(self): + self.populate_cpu_children() + + def bw_parent(evt): + if evt is None: + return None + elif evt.scope == 1: + return evt + else: + return bw_parent(evt.cpu_parent) + + fwd_stacks = {} + for evt in self: + if bw_parent(evt) is None: + t = (evt.sequence_nr, evt.thread) + if t not in fwd_stacks: + fwd_stacks[t] = evt.stack + + for evt in self: + p = bw_parent(evt) + if p is not None: + assert p.fwd_thread is not None + t = (p.sequence_nr, p.fwd_thread) + if t in fwd_stacks: + evt.stack = fwd_stacks[t] + else: + evt.stack = [] + + @property def self_cpu_time_total(self): return sum([event.self_cpu_time_total for event in self]) @@ -208,14 +237,17 @@ def export_chrome_trace(self, path): f.truncate() f.write("]") - def key_averages(self, group_by_input_shapes=False): + def key_averages(self, group_by_input_shapes=False, group_by_stack_n=0): """Averages all function events over their keys. - @param group_by_input_shapes The key would become - (event name, input dimensions) rather than just event name. - This is useful to see which dimensionality contributes to the runtime - the most and may help with dimension specific optimizations or - choosing best candidates for quantization (aka fitting a roof line) + Arguments: + group_by_input_shapes: group entries by + (event name, input shapes) rather than just event name. + This is useful to see which input shapes contribute to the runtime + the most and may help with size-specific optimizations or + choosing the best candidates for quantization (aka fitting a roof line) + + group_by_stack_n: group by top n stack trace entries Returns: An EventList containing FunctionEventAvg objects. 
@@ -223,14 +255,22 @@ def key_averages(self, group_by_input_shapes=False): self.populate_cpu_children() stats = defaultdict(FunctionEventAvg) - def get_key(event, group_by_input_shapes): - if not group_by_input_shapes: - return (event.key, event.node_id) - return (event.key, str(event.input_shapes), event.node_id) + def get_key(event, group_by_input_shapes, group_by_stack_n): + key = [str(event.key), str(event.node_id)] + if group_by_input_shapes: + key.append(str(event.input_shapes)) + if group_by_stack_n > 0: + key += event.stack[:group_by_stack_n] + return tuple(key) for evt in self: - stats[get_key(evt, group_by_input_shapes)].add( - evt, group_by_input_shapes) - return EventList(stats.values(), use_cuda=self._use_cuda, profile_memory=self._profile_memory) + stats[get_key(evt, group_by_input_shapes, group_by_stack_n)].add(evt) + + avg_list = EventList(stats.values(), use_cuda=self._use_cuda, profile_memory=self._profile_memory) + for evt in avg_list: + evt.stack = evt.stack[:group_by_stack_n] + if not group_by_input_shapes: + evt.input_shapes = "" + return avg_list def total_average(self): """Averages all events. @@ -274,8 +314,11 @@ class profile(object): profile_memory (bool, optional): Whether to report memory usage, default: ``False`` + with_stack (bool, optional): record source information (file and line number) for the ops + .. warning: - Enabling memory profiling incurs additional profiler overhead + Enabling memory profiling or source attribution incurs additional profiler + overhead .. warning: This context managers should not be called recursively, i.e. no nested @@ -311,7 +354,8 @@ def __init__( enabled=True, use_cuda=False, record_shapes=False, - profile_memory=False): + profile_memory=False, + with_stack=False): self.enabled = enabled self.use_cuda = use_cuda self.function_events = None @@ -320,6 +364,7 @@ def __init__( self.entered = False self.record_shapes = record_shapes self.profile_memory = profile_memory + self.with_stack = with_stack def __enter__(self): if not self.enabled: @@ -330,7 +375,11 @@ def __enter__(self): profiler_kind = torch.autograd.ProfilerState.CUDA if self.use_cuda \ else torch.autograd.ProfilerState.CPU - config = torch.autograd.ProfilerConfig(profiler_kind, self.record_shapes, self.profile_memory) + config = torch.autograd.ProfilerConfig( + profiler_kind, + self.record_shapes, + self.profile_memory, + self.with_stack) torch.autograd._enable_profiler(config) return self @@ -339,9 +388,11 @@ def __exit__(self, exc_type, exc_val, exc_tb): return records = torch.autograd._disable_profiler() self.function_events = EventList( - parse_cpu_trace(records), + parse_event_records(records), use_cuda=self.use_cuda, profile_memory=self.profile_memory) + if self.with_stack: + self.function_events.set_backward_stacktraces() return False def __repr__(self): @@ -373,9 +424,9 @@ def export_chrome_trace(self, path): return self.function_events.export_chrome_trace(path) export_chrome_trace.__doc__ = EventList.export_chrome_trace.__doc__ - def key_averages(self, group_by_input_shape=False): + def key_averages(self, group_by_input_shape=False, group_by_stack_n=0): self._check_finish() - return self.function_events.key_averages(group_by_input_shape) + return self.function_events.key_averages(group_by_input_shape, group_by_stack_n) key_averages.__doc__ = EventList.key_averages.__doc__ def total_average(self): @@ -568,8 +619,8 @@ def __enter__(self): torch.autograd.ProfilerConfig( torch.autograd.ProfilerState.NVTX, self.record_shapes, - False - ) + False, + False) ) 
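Putting the new profiler options together: `with_stack=True` records source locations for each op (with the documented extra overhead), and `key_averages` can now also group by the top-N stack entries. A usage sketch; the exact table layout depends on the build:

```python
import torch
from torch.autograd import profiler

x = torch.randn(32, 64, requires_grad=True)
w = torch.randn(64, 64, requires_grad=True)

with profiler.profile(record_shapes=True, with_stack=True) as prof:
    loss = (x @ w).relu().sum()
    loss.backward()

# group by (name, input shapes) as before, or by the top 5 stack frames
print(prof.key_averages(group_by_input_shape=True).table(sort_by="self_cpu_time_total"))
print(prof.key_averages(group_by_stack_n=5).table(sort_by="self_cpu_time_total"))
```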
return self @@ -639,6 +690,7 @@ class FormattedTimesMixin(object): cpu_time_total_str = attr_formatter('cpu_time_total') cuda_time_total_str = attr_formatter('cuda_time_total') self_cpu_time_total_str = attr_formatter('self_cpu_time_total') + self_cuda_time_total_str = attr_formatter('self_cuda_time_total') @property def cpu_time(self): @@ -664,19 +716,22 @@ def elapsed_us(self): class FunctionEvent(FormattedTimesMixin): """Profiling information about a single function.""" def __init__( - self, id, node_id, name, thread, cpu_start, cpu_end, input_shapes=None, - cpu_memory_usage=0, cuda_memory_usage=0, is_async=False, is_remote=True, - sequence_nr=-1): + self, id, node_id, name, thread, cpu_start, cpu_end, fwd_thread=None, input_shapes=None, + stack=None, scope=0, cpu_memory_usage=0, cuda_memory_usage=0, is_async=False, + is_remote=True, sequence_nr=-1): self.id = id self.node_id = node_id self.name = name self.cpu_interval = Interval(cpu_start, cpu_end) self.thread = thread + self.fwd_thread = fwd_thread self.kernels = [] self.count = 1 self.cpu_children = [] self.cpu_parent = None self.input_shapes = input_shapes + self.stack = stack + self.scope = scope self.cpu_memory_usage = cpu_memory_usage self.cuda_memory_usage = cuda_memory_usage self.is_async = is_async @@ -735,6 +790,11 @@ def self_cpu_time_total(self): def cuda_time_total(self): return sum(kinfo.interval.elapsed_us() for kinfo in self.kernels) + @property + def self_cuda_time_total(self): + return sum(kinfo.interval.elapsed_us() for kinfo in self.kernels) - \ + sum([child.cuda_time_total for child in self.cpu_children]) + @property def cpu_time_total(self): return self.cpu_interval.elapsed_us() @@ -778,7 +838,10 @@ def __init__(self): self.cpu_time_total = 0 self.cuda_time_total = 0 self.self_cpu_time_total = 0 + self.self_cuda_time_total = 0 self.input_shapes = None + self.stack = None + self.scope = None self.cpu_memory_usage = 0 self.cuda_memory_usage = 0 self.self_cpu_memory_usage = 0 @@ -786,7 +849,7 @@ def __init__(self): self.cpu_children = None self.cpu_parent = None - def add(self, other, group_by_input_shapes=False): + def add(self, other): if self.key is None: # First function being recorded as part of FunctionEventAvg, propagate # fields. 
@@ -796,18 +859,17 @@ def add(self, other, group_by_input_shapes=False): self.is_remote = other.is_remote self.cpu_parent = other.cpu_parent self.cpu_children = other.cpu_children - if group_by_input_shapes: - self.input_shapes = other.input_shapes - assert ( - not group_by_input_shapes or - other.input_shapes == self.input_shapes - ) + self.input_shapes = other.input_shapes + self.stack = other.stack + self.scope = other.scope + assert isinstance(other, (FunctionEvent, FunctionEventAvg)) assert other.key == self.key self.cpu_time_total += other.cpu_time_total self.cuda_time_total += other.cuda_time_total self.self_cpu_time_total += other.self_cpu_time_total + self.self_cuda_time_total += other.self_cuda_time_total self.cpu_memory_usage += other.cpu_memory_usage self.cuda_memory_usage += other.cuda_memory_usage self.self_cpu_memory_usage += other.self_cpu_memory_usage @@ -821,11 +883,12 @@ def __iadd__(self, other): def __repr__(self): return ( ' ' - 'cpu_memory_usage={} cuda_memory_usage={}'.format( + ' self_cuda_time={} cuda_time={} input_shapes={} ' + 'cpu_memory_usage={} cuda_memory_usage={}>'.format( self.key, self.self_cpu_time_total_str, self.cpu_time_str, + self.self_cuda_time_total_str, self.cuda_time_str, str(self.input_shapes), self.cpu_memory_usage, @@ -845,14 +908,10 @@ def __missing__(self, key): self[key] = torch._C._demangle(key) if len(key) > 1 else key return self[key] - -################################################################################ -# CPU checkpoints - -def parse_cpu_trace(thread_records): +def parse_event_records(thread_records): def get_record_key(record): """ - Returns a tuple to be used by parse_cpu_trace for correlating start and + Returns a tuple to be used by parse_event_records for correlating start and end records. """ return (record.handle(), record.node_id()) @@ -873,6 +932,17 @@ def get_record_key(record): "aten::_version", ] + def filter_stack_entry(entry): + filtered_entries = [ + ("autograd/__init__", "_make_grads"), + ("autograd/__init__", "backward"), + ("torch/tensor", "backward"), + ("_internal/common_utils", "prof_callable"), + ("_internal/common_utils", "prof_func_call"), + ("_internal/common_utils", "prof_meth_call"), + ] + return all([not (f[0] in entry and f[1] in entry) for f in filtered_entries]) + # cuda start events and the overall profiler start event don't happen # at exactly the same time because we need to record an event on each device # and each record takes ~4us. 
So we adjust here by the difference @@ -951,7 +1021,10 @@ def adjusted_time(cuda_record, cuda_records_map): thread=start.thread_id(), cpu_start=start_record.cpu_elapsed_us(start), cpu_end=start_record.cpu_elapsed_us(record), + fwd_thread=start.fwd_thread_id(), input_shapes=start.shapes(), + stack=[entry for entry in start.stack() if filter_stack_entry(entry)], + scope=start.scope(), cpu_memory_usage=cpu_memory_usage, cuda_memory_usage=cuda_memory_usage, is_async=is_async, @@ -1088,22 +1161,37 @@ def build_table( ), use_cuda=use_cuda, profile_memory=profile_memory) has_input_shapes = any( - [event.input_shapes is not None for event in events]) + [(event.input_shapes is not None and len(event.input_shapes) > 0) for event in events]) + name_column_width = max([len(evt.key) for evt in events]) + 4 - DEFAULT_COLUMN_WIDTH = 15 - SHAPES_COLUMN_WIDTH = 45 + + DEFAULT_COLUMN_WIDTH = 12 + + shapes_column_width = max([len(str(evt.input_shapes)) for evt in events]) + 4 + shapes_column_width = min(shapes_column_width, 45) + + src_column_width = None + stacks = [] + for evt in events: + if evt.stack is not None and len(evt.stack) > 0: + stacks.append(evt.stack) + has_stack = len(stacks) > 0 + if has_stack: + src_column_width = max([max([len(entry) for entry in stack]) for stack in stacks]) + 4 + src_column_width = min(src_column_width, 75) headers = [ 'Name', - 'Self CPU total %', - 'Self CPU total', + 'Self CPU %', + 'Self CPU', 'CPU total %', 'CPU total', 'CPU time avg', ] if use_cuda: headers.extend([ - 'CUDA total %', + 'Self CUDA', + 'Self CUDA %', 'CUDA total', 'CUDA time avg', ]) @@ -1118,7 +1206,7 @@ def build_table( 'Self CUDA Mem', ]) headers.append( - 'Number of Calls' + '# of Calls' ) # Only append Node ID if any event has a valid (>= 0) Node ID append_node_id = any([evt.node_id != -1 for evt in events]) @@ -1130,10 +1218,11 @@ def build_table( row_format = [""] header_sep = [""] line_length = [-SPACING_SIZE] + MAX_STACK_ENTRY = 5 - def add_column(padding): - row_format[0] += '{: <' + str(padding) + '} ' - header_sep[0] += '-' * padding + ' ' + def add_column(padding, text_dir='>'): + row_format[0] += '{: ' + text_dir + str(padding) + '}' + (' ' * SPACING_SIZE) + header_sep[0] += '-' * padding + (' ' * SPACING_SIZE) line_length[0] += padding + SPACING_SIZE add_column(name_column_width) @@ -1142,7 +1231,11 @@ def add_column(padding): if has_input_shapes: headers.append('Input Shapes') - add_column(SHAPES_COLUMN_WIDTH) + add_column(shapes_column_width) + + if has_stack: + headers.append('Source Location') + add_column(src_column_width, text_dir='<') row_format = row_format[0] header_sep = header_sep[0] @@ -1157,7 +1250,7 @@ def append(s): result.append('\n') # Yes, newline after the end as well self_cpu_time_total = sum([event.self_cpu_time_total for event in events]) - cuda_time_total = sum([evt.cuda_time_total for evt in events]) + cuda_time_total = sum([evt.self_cuda_time_total for evt in events]) # Actual printing if header is not None: append('=' * line_length) @@ -1191,8 +1284,9 @@ def append(s): ] if use_cuda: row_values.extend([ + evt.self_cuda_time_total_str, # CUDA time total % - format_time_share(evt.cuda_time_total, cuda_time_total), + format_time_share(evt.self_cuda_time_total, cuda_time_total), evt.cuda_time_total_str, evt.cuda_time_str, # Cuda time avg ]) @@ -1217,9 +1311,21 @@ def append(s): if append_node_id: row_values.append(evt.node_id) if has_input_shapes: - row_values.append(str(evt.input_shapes)[:SHAPES_COLUMN_WIDTH]) + 
row_values.append(str(evt.input_shapes)[:shapes_column_width]) + if has_stack: + src_field = "" + if len(evt.stack) > 0: + src_field = evt.stack[0][:src_column_width] + row_values.append(src_field) append(row_format.format(*row_values)) + if has_stack: + empty_headers = [""] * (len(headers) - 1) + for entry in evt.stack[1:MAX_STACK_ENTRY]: + append(row_format.format(*(empty_headers + [entry[:src_column_width]]))) + empty_headers.append("") + append(row_format.format(*empty_headers)) + append(header_sep) append("Self CPU time total: {}".format(format_time(self_cpu_time_total))) if use_cuda: diff --git a/torch/csrc/DynamicTypes.cpp b/torch/csrc/DynamicTypes.cpp index 6969ac0449c0..f7e48c3b682d 100644 --- a/torch/csrc/DynamicTypes.cpp +++ b/torch/csrc/DynamicTypes.cpp @@ -61,9 +61,8 @@ PyTypeObject* getPyTypeObject( const at::Storage& storage, const caffe2::TypeMeta& dtype) { at::ScalarType scalarType = at::typeMetaToScalarType(dtype); - at::TensorOptions options = at::TensorOptions(storage.device_type()).dtype(scalarType); auto attype = &at::getDeprecatedTypeProperties( - at::dispatchKeyToBackend(at::computeDispatchKey(options)), + at::dispatchKeyToBackend(c10::computeDispatchKey(scalarType, c10::nullopt, storage.device_type())), scalarType); auto it = attype_to_py_storage_type.find(attype); if (it != attype_to_py_storage_type.end()) { diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index ed4aa21a8f76..6f61b5e0a2d9 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -61,6 +61,12 @@ #endif #endif +#if (defined(_WIN32) || defined(_WIN64) || defined(FBCODE_CAFFE2) || defined(C10_MOBILE)) +#define NVALGRIND +#else +#include +#endif + #define WITH_NUMPY_IMPORT_ARRAY #include @@ -127,6 +133,7 @@ static PyObject * THPModule_initExtension(PyObject *_unused, PyObject *shm_manag THPByteStorage_postInit(module); THPBoolStorage_postInit(module); THPQUInt8Storage_postInit(module); + THPQUInt4x2Storage_postInit(module); THPQInt8Storage_postInit(module); THPQInt32Storage_postInit(module); THPBFloat16Storage_postInit(module); @@ -527,12 +534,12 @@ PyObject *THPModule_setQEngine(PyObject */* unused */, PyObject *arg) Py_RETURN_NONE; } -PyObject *THPModule_qEngine(PyObject */* unused */) +PyObject *THPModule_qEngine(PyObject *_unused, PyObject *noargs) { return THPUtils_packInt64(static_cast(at::globalContext().qEngine())); } -PyObject *THPModule_supportedQEngines(PyObject */* unused */) +PyObject *THPModule_supportedQEngines(PyObject *_unused, PyObject *noargs) { auto qengines = at::globalContext().supportedQEngines(); auto list = THPObjectPtr(PyList_New(qengines.size())); @@ -546,7 +553,7 @@ PyObject *THPModule_supportedQEngines(PyObject */* unused */) return list.release(); } -PyObject *THPModule_isEnabledXNNPACK(PyObject * /* unused */) +PyObject *THPModule_isEnabledXNNPACK(PyObject *_unused, PyObject *noargs) { if (at::globalContext().isXNNPACKAvailable()) Py_RETURN_TRUE; else Py_RETURN_FALSE; @@ -567,52 +574,52 @@ static PyObject * THPModule_vmapmode_decrement_nesting(PyObject* _unused, PyObje //NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays, modernize-avoid-c-arrays) static PyMethodDef TorchMethods[] = { {"_initExtension", (PyCFunction)THPModule_initExtension, METH_O, nullptr}, - {"_autograd_init", (PyCFunction)THPAutograd_initExtension, METH_NOARGS, nullptr}, + {"_autograd_init", THPAutograd_initExtension, METH_NOARGS, nullptr}, {"_add_docstr", (PyCFunction)THPModule_addDocStr, METH_VARARGS, nullptr}, {"_init_names", (PyCFunction)THPModule_initNames, METH_O, nullptr}, - 
{"_has_distributed",(PyCFunction)THPModule_hasDistributed, METH_NOARGS, nullptr}, + {"_has_distributed",THPModule_hasDistributed, METH_NOARGS, nullptr}, {"_set_default_tensor_type", (PyCFunction)THPModule_setDefaultTensorType, METH_O, nullptr}, {"_set_default_dtype", (PyCFunction)THPModule_setDefaultDtype, METH_O, nullptr}, {"_infer_size", (PyCFunction)THPModule_inferSize, METH_VARARGS, nullptr}, {"_crash_if_csrc_asan", (PyCFunction)THPModule_crashIfCsrcASAN, METH_O, nullptr}, {"_crash_if_csrc_ubsan", (PyCFunction)THPModule_crashIfCsrcUBSAN, METH_O, nullptr}, {"_crash_if_aten_asan", (PyCFunction)THPModule_crashIfATenASAN, METH_O, nullptr}, - {"_show_config", (PyCFunction)THPModule_showConfig, METH_NOARGS, nullptr}, - {"_parallel_info", (PyCFunction)THPModule_parallelInfo, METH_NOARGS, nullptr}, + {"_show_config", THPModule_showConfig, METH_NOARGS, nullptr}, + {"_parallel_info", THPModule_parallelInfo, METH_NOARGS, nullptr}, {"_set_backcompat_broadcast_warn", (PyCFunction)THPModule_setBackcompatBroadcastWarn, METH_O, nullptr}, - {"_get_backcompat_broadcast_warn", (PyCFunction)THPModule_getBackcompatBroadcastWarn, METH_NOARGS, nullptr}, + {"_get_backcompat_broadcast_warn", THPModule_getBackcompatBroadcastWarn, METH_NOARGS, nullptr}, {"_set_backcompat_keepdim_warn", (PyCFunction)THPModule_setBackcompatKeepdimWarn, METH_O, nullptr}, - {"_get_backcompat_keepdim_warn", (PyCFunction)THPModule_getBackcompatKeepdimWarn, METH_NOARGS, nullptr}, - {"get_num_threads", (PyCFunction)THPModule_getNumThreads, METH_NOARGS, nullptr}, + {"_get_backcompat_keepdim_warn", THPModule_getBackcompatKeepdimWarn, METH_NOARGS, nullptr}, + {"get_num_threads", THPModule_getNumThreads, METH_NOARGS, nullptr}, {"set_num_threads", (PyCFunction)THPModule_setNumThreads, METH_O, nullptr}, - {"get_num_interop_threads", (PyCFunction)THPModule_getNumInteropThreads, METH_NOARGS, nullptr}, + {"get_num_interop_threads", THPModule_getNumInteropThreads, METH_NOARGS, nullptr}, {"set_num_interop_threads", (PyCFunction)THPModule_setNumInteropThreads, METH_O, nullptr}, - {"_get_cudnn_enabled", (PyCFunction)THPModule_userEnabledCuDNN, METH_NOARGS, nullptr}, + {"_get_cudnn_enabled", THPModule_userEnabledCuDNN, METH_NOARGS, nullptr}, {"_set_cudnn_enabled", (PyCFunction)THPModule_setUserEnabledCuDNN, METH_O, nullptr}, - {"_get_mkldnn_enabled", (PyCFunction)THPModule_userEnabledMkldnn, METH_NOARGS, nullptr}, + {"_get_mkldnn_enabled", THPModule_userEnabledMkldnn, METH_NOARGS, nullptr}, {"_set_mkldnn_enabled", (PyCFunction)THPModule_setUserEnabledMkldnn, METH_O, nullptr}, - {"_get_cudnn_allow_tf32", (PyCFunction)THPModule_allowTF32CuDNN, METH_NOARGS, nullptr}, + {"_get_cudnn_allow_tf32", THPModule_allowTF32CuDNN, METH_NOARGS, nullptr}, {"_set_cudnn_allow_tf32", (PyCFunction)THPModule_setAllowTF32CuDNN, METH_O, nullptr}, - {"_get_cudnn_benchmark", (PyCFunction)THPModule_benchmarkCuDNN, METH_NOARGS, nullptr}, + {"_get_cudnn_benchmark", THPModule_benchmarkCuDNN, METH_NOARGS, nullptr}, {"_set_cudnn_benchmark", (PyCFunction)THPModule_setBenchmarkCuDNN, METH_O, nullptr}, - {"_get_cudnn_deterministic", (PyCFunction)THPModule_deterministicCuDNN, METH_NOARGS, nullptr}, + {"_get_cudnn_deterministic", THPModule_deterministicCuDNN, METH_NOARGS, nullptr}, {"_set_cudnn_deterministic", (PyCFunction)THPModule_setDeterministicCuDNN, METH_O, nullptr}, - {"_get_deterministic", (PyCFunction)THPModule_deterministic, METH_NOARGS, nullptr}, + {"_get_deterministic", THPModule_deterministic, METH_NOARGS, nullptr}, {"_set_deterministic", 
(PyCFunction)THPModule_setDeterministic, METH_O, nullptr}, - {"_get_cublas_allow_tf32", (PyCFunction)THPModule_allowTF32CuBLAS, METH_NOARGS, nullptr}, + {"_get_cublas_allow_tf32", THPModule_allowTF32CuBLAS, METH_NOARGS, nullptr}, {"_set_cublas_allow_tf32", (PyCFunction)THPModule_setAllowTF32CuBLAS, METH_O, nullptr}, - {"_vmapmode_increment_nesting", (PyCFunction)THPModule_vmapmode_increment_nesting, METH_NOARGS, nullptr}, - {"_vmapmode_decrement_nesting", (PyCFunction)THPModule_vmapmode_decrement_nesting, METH_NOARGS, nullptr}, + {"_vmapmode_increment_nesting", THPModule_vmapmode_increment_nesting, METH_NOARGS, nullptr}, + {"_vmapmode_decrement_nesting", THPModule_vmapmode_decrement_nesting, METH_NOARGS, nullptr}, {"_to_dlpack", (PyCFunction)THPModule_toDLPack, METH_O, nullptr}, {"_from_dlpack", (PyCFunction)THPModule_fromDLPack, METH_O, nullptr}, {"set_flush_denormal", (PyCFunction)THPModule_setFlushDenormal, METH_O, nullptr}, - {"get_default_dtype", (PyCFunction)THPModule_getDefaultDtype, METH_NOARGS, nullptr}, - {"_get_default_device", (PyCFunction)THPModule_getDefaultDevice, METH_NOARGS, nullptr}, - {"_get_qengine", (PyCFunction)THPModule_qEngine, METH_NOARGS, nullptr}, + {"get_default_dtype", THPModule_getDefaultDtype, METH_NOARGS, nullptr}, + {"_get_default_device", THPModule_getDefaultDevice, METH_NOARGS, nullptr}, + {"_get_qengine", THPModule_qEngine, METH_NOARGS, nullptr}, {"_set_qengine", (PyCFunction)THPModule_setQEngine, METH_O, nullptr}, - {"_supported_qengines", (PyCFunction)THPModule_supportedQEngines, METH_NOARGS, nullptr}, - {"_is_xnnpack_enabled", (PyCFunction)THPModule_isEnabledXNNPACK, METH_NOARGS, nullptr}, - {"_is_torch_function_enabled", (PyCFunction)THPModule_isEnabledTorchFunction, METH_NOARGS, nullptr}, + {"_supported_qengines", THPModule_supportedQEngines, METH_NOARGS, nullptr}, + {"_is_xnnpack_enabled", THPModule_isEnabledXNNPACK, METH_NOARGS, nullptr}, + {"_is_torch_function_enabled", THPModule_isEnabledTorchFunction, METH_NOARGS, nullptr}, {"_disabled_torch_function_impl", (PyCFunction)THPModule_disable_torch_function, METH_VARARGS, nullptr}, {nullptr, nullptr, 0, nullptr} }; @@ -688,9 +695,9 @@ PyObject* initModule() { #ifdef USE_CUDA THPUtils_addPyMethodDefs(methods, THCPModule_methods()); #endif -#ifdef USE_DISTRIBUTED -#ifdef USE_C10D +#if defined(USE_DISTRIBUTED) && defined(USE_C10D) THPUtils_addPyMethodDefs(methods, torch::distributed::c10d::python_functions()); +#ifndef _WIN32 THPUtils_addPyMethodDefs(methods, torch::distributed::rpc::python_functions()); THPUtils_addPyMethodDefs( methods, torch::distributed::autograd::python_functions()); @@ -746,6 +753,7 @@ PyObject* initModule() { ASSERT_TRUE(THPQUInt8Storage_init(module)); ASSERT_TRUE(THPQInt8Storage_init(module)); ASSERT_TRUE(THPQInt32Storage_init(module)); + ASSERT_TRUE(THPQUInt4x2Storage_init(module)); ASSERT_TRUE(THPBFloat16Storage_init(module)); ASSERT_TRUE(THPComplexDoubleStorage_init(module)); ASSERT_TRUE(THPComplexFloatStorage_init(module)); @@ -821,6 +829,26 @@ Call this whenever a new thread is created in order to propagate values from ASSERT_TRUE(set_module_attr("has_mkl", at::hasMKL() ? Py_True : Py_False)); ASSERT_TRUE(set_module_attr("has_lapack", at::hasLAPACK() ? 
Py_True : Py_False)); + py_module.def( + "valgrind_supported_platform", [](){ + #if defined(NVALGRIND) + return false; + #else + return true; + #endif + } + ); + + py_module.def( + "valgrind_toggle", [](){ + #if defined(NVALGRIND) + TORCH_CHECK(false, "Valgrind is not supported."); + #else + CALLGRIND_TOGGLE_COLLECT; + #endif + } + ); + #ifdef USE_CUDA PyObject *has_cuda = Py_True; #else diff --git a/torch/csrc/Storage.h b/torch/csrc/Storage.h index 5e708f2b4f2d..e7c8bfdbe4f2 100644 --- a/torch/csrc/Storage.h +++ b/torch/csrc/Storage.h @@ -35,7 +35,8 @@ PyObject_IsInstance(obj, THPComplexDoubleStorageClass) #define THPComplexFloatStorage_Check(obj) \ PyObject_IsInstance(obj, THPComplexFloatStorageClass) - +#define THPQUInt4x2Storage_Check(obj) \ + PyObject_IsInstance(obj, THPQUInt8StorageClass) #define THPDoubleStorage_CData(obj) (obj)->cdata #define THPFloatStorage_CData(obj) (obj)->cdata @@ -52,6 +53,7 @@ #define THPBFloat16Storage_CData(obj) (obj)->cdata #define THPComplexDoubleStorage_CData(obj) (obj)->cdata #define THPComplexFloatStorage_CData(obj) (obj)->cdata +#define THPQUInt4x2Storage_CData(obj) (obj)->cdata #define THPStorageType TH_CONCAT_3(THP,Real,StorageType) #define THPStorageBaseStr TH_CONCAT_STRING_2(Real,StorageBase) diff --git a/torch/csrc/WindowsTorchApiMacro.h b/torch/csrc/WindowsTorchApiMacro.h index 7f8ef4e01677..7f44db0baba9 100644 --- a/torch/csrc/WindowsTorchApiMacro.h +++ b/torch/csrc/WindowsTorchApiMacro.h @@ -5,3 +5,9 @@ // There's no difference between aten, torch and caffe2 libs any more // TODO: clean up the naming for consistency #define TORCH_API CAFFE2_API + +#ifdef _WIN32 +#define TORCH_PYTHON_API +#else +#define TORCH_PYTHON_API CAFFE2_API +#endif diff --git a/torch/csrc/api/include/torch/fft.h b/torch/csrc/api/include/torch/fft.h index 9622f668214f..8a094ec9e235 100644 --- a/torch/csrc/api/include/torch/fft.h +++ b/torch/csrc/api/include/torch/fft.h @@ -35,6 +35,36 @@ inline Tensor ifft(const Tensor& self, return torch::fft_ifft(self, n, dim, norm); } +/// Computes the N dimensional fast Fourier transform over given dimensions. +/// See https://pytorch.org/docs/master/fft.html#torch.fft.fftn. +/// +/// Example: +/// ``` +/// auto t = torch::randn({128, 128}, dtype=kComplexDouble); +/// torch::fft::fftn(t); +/// ``` +inline Tensor fftn(const Tensor& self, + c10::optional s=c10::nullopt, + c10::optional dim=c10::nullopt, + c10::optional norm=c10::nullopt) { + return torch::fft_fftn(self, s, dim, norm); +} + +/// Computes the N dimensional fast Fourier transform over given dimensions. +/// See https://pytorch.org/docs/master/fft.html#torch.fft.ifftn. +/// +/// Example: +/// ``` +/// auto t = torch::randn({128, 128}, dtype=kComplexDouble); +/// torch::fft::ifftn(t); +/// ``` +inline Tensor ifftn(const Tensor& self, + c10::optional s=c10::nullopt, + c10::optional dim=c10::nullopt, + c10::optional norm=c10::nullopt) { + return torch::fft_ifftn(self, s, dim, norm); +} + /// Computes the 1 dimensional FFT of real input with onesided Hermitian output. /// See https://pytorch.org/docs/master/fft.html#torch.fft.rfft. /// @@ -69,6 +99,36 @@ inline Tensor irfft(const Tensor& self, return torch::fft_irfft(self, n, dim, norm); } +/// Computes the N dimensional FFT of real input with onesided Hermitian output. 
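The two `py_module.def` bindings above expose a Callgrind toggle to Python. A hedged sketch of how they could be driven; the `torch._C.valgrind_*` attribute names are assumed from the binding registration, and the calls are only meaningful when the process runs under `valgrind --tool=callgrind --collect-atstart=no`:

```python
import torch

# Hypothetical driver for the new bindings; names assume they surface on torch._C.
if torch._C.valgrind_supported_platform():
    torch._C.valgrind_toggle()                       # start collecting Callgrind counts
    torch.randn(128, 128) @ torch.randn(128, 128)    # region of interest
    torch._C.valgrind_toggle()                       # stop collecting
```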
+/// See https://pytorch.org/docs/master/fft.html#torch.fft.rfftn +/// +/// Example: +/// ``` +/// auto t = torch::randn({128, 128}, dtype=kDouble); +/// torch::fft::rfftn(t); +/// ``` +inline Tensor rfftn(const Tensor& self, + c10::optional s=c10::nullopt, + c10::optional dim=c10::nullopt, + c10::optional norm=c10::nullopt) { + return torch::fft_rfftn(self, s, dim, norm); +} + +/// Computes the inverse of torch.fft.rfftn. +/// See https://pytorch.org/docs/master/fft.html#torch.fft.irfftn. +/// +/// Example: +/// ``` +/// auto t = torch::randn({128, 128}, dtype=kComplexDouble); +/// torch::fft::irfftn(t); +/// ``` +inline Tensor irfftn(const Tensor& self, + c10::optional s=c10::nullopt, + c10::optional dim=c10::nullopt, + c10::optional norm=c10::nullopt) { + return torch::fft_irfftn(self, s, dim, norm); +} + /// Computes the 1 dimensional FFT of a onesided Hermitian signal /// /// The input represents a Hermitian symmetric time domain signal. The returned @@ -106,4 +166,66 @@ inline Tensor ihfft(const Tensor& self, return torch::fft_ihfft(self, n, dim, norm); } +/// Computes the discrete Fourier Transform sample frequencies for a signal of size n. +/// +/// See https://pytorch.org/docs/master/fft.html#torch.fft.fftfreq +/// +/// Example: +/// ``` +/// auto frequencies = torch::fft::fftfreq(128, torch::kDouble); +/// ``` +inline Tensor fftfreq(int64_t n, double d, const TensorOptions& options={}) { + return torch::fft_fftfreq(n, d, options); +} + +inline Tensor fftfreq(int64_t n, const TensorOptions& options={}) { + return torch::fft_fftfreq(n, /*d=*/1.0, options); +} + +/// Computes the sample frequencies for torch.fft.rfft with a signal of size n. +/// +/// Like torch.fft.rfft, only the positive frequencies are included. +/// See https://pytorch.org/docs/master/fft.html#torch.fft.rfftfreq +/// +/// Example: +/// ``` +/// auto frequencies = torch::fft::rfftfreq(128, torch::kDouble); +/// ``` +inline Tensor rfftfreq(int64_t n, double d, const TensorOptions& options) { + return torch::fft_rfftfreq(n, d, options); +} + +inline Tensor rfftfreq(int64_t n, const TensorOptions& options) { + return torch::fft_rfftfreq(n, /*d=*/1.0, options); +} + +/// Reorders n-dimensional FFT output to have negative frequency terms first, by +/// a torch.roll operation. 
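The inline wrappers above forward to the `fft_*` ATen ops that also back the Python `torch.fft` module, so the C++ and Python surfaces stay in sync. For reference, a sketch of the equivalent Python calls, assuming this build exposes the matching `torch.fft` functions:

```python
import torch
import torch.fft

t = torch.randn(128, 128, dtype=torch.complex128)
spec = torch.fft.fftn(t)                 # N-D complex transform
roundtrip = torch.fft.ifftn(spec)        # ~t up to floating-point error

real = torch.randn(128, 128, dtype=torch.float64)
half = torch.fft.rfftn(real)             # one-sided Hermitian output
back = torch.fft.irfftn(half, real.shape)

freqs = torch.fft.fftfreq(128, d=0.5)    # DFT sample frequencies for spacing d
pos_freqs = torch.fft.rfftfreq(128)      # positive frequencies only
```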
+/// +/// See https://pytorch.org/docs/master/fft.html#torch.fft.fftshift +/// +/// Example: +/// ``` +/// auto x = torch::randn({127, 4}); +/// auto centred_fft = torch::fft::fftshift(torch::fft::fftn(x)); +/// ``` +inline Tensor fftshift(const Tensor& x, c10::optional dim=c10::nullopt) { + return torch::fft_fftshift(x, dim); +} + +/// Inverse of torch.fft.fftshift +/// +/// See https://pytorch.org/docs/master/fft.html#torch.fft.ifftshift +/// +/// Example: +/// ``` +/// auto x = torch::randn({127, 4}); +/// auto shift = torch::fft::fftshift(x) +/// auto unshift = torch::fft::ifftshift(shift); +/// assert(torch::allclose(x, unshift)); +/// ``` +inline Tensor ifftshift(const Tensor& x, c10::optional dim=c10::nullopt) { + return torch::fft_ifftshift(x, dim); +} + }} // torch::fft diff --git a/torch/csrc/api/include/torch/nn/functional/loss.h b/torch/csrc/api/include/torch/nn/functional/loss.h index b5a06f4cfb14..6ed3c37311c0 100644 --- a/torch/csrc/api/include/torch/nn/functional/loss.h +++ b/torch/csrc/api/include/torch/nn/functional/loss.h @@ -307,9 +307,9 @@ inline Tensor cosine_embedding_loss( // ============================================================================ -inline Tensor _smooth_l1_loss(const Tensor& input, const Tensor& target) { +inline Tensor _smooth_l1_loss(const Tensor& input, const Tensor& target, double beta = 1.) { auto t = torch::abs(input - target); - return torch::where(t < 1, 0.5 * torch::pow(t, 2), t - 0.5); + return torch::where(t < beta, 0.5 * torch::pow(t, 2) / beta, t - 0.5 * beta); } #ifndef DOXYGEN_SHOULD_SKIP_THIS @@ -317,7 +317,8 @@ namespace detail { inline Tensor smooth_l1_loss( const Tensor& input, const Tensor& target, - SmoothL1LossFuncOptions::reduction_t reduction) { + SmoothL1LossFuncOptions::reduction_t reduction, + double beta = 1.) { if (target.sizes() != input.sizes()) { TORCH_WARN("Using a target size (", target.sizes(), ") that is different to the input size (", input.sizes(), "). ", "This will likely lead to incorrect results due to broadcasting. ", @@ -325,7 +326,7 @@ inline Tensor smooth_l1_loss( } std::vector expanded_tensors = torch::broadcast_tensors({input, target}); - return torch::smooth_l1_loss(expanded_tensors[0], expanded_tensors[1], enumtype::reduction_get_enum(reduction)); + return torch::smooth_l1_loss(expanded_tensors[0], expanded_tensors[1], enumtype::reduction_get_enum(reduction), beta); } } // namespace detail #endif /* DOXYGEN_SHOULD_SKIP_THIS */ @@ -344,8 +345,9 @@ inline Tensor smooth_l1_loss( inline Tensor smooth_l1_loss( const Tensor& input, const Tensor& target, - const SmoothL1LossFuncOptions& options = {}) { - return detail::smooth_l1_loss(input, target, options.reduction()); + const SmoothL1LossFuncOptions& options = {}, + double beta = 1.) 
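The updated `_smooth_l1_loss` helper generalizes the loss to an arbitrary transition point `beta`: quadratic for `|x - y| < beta`, linear beyond, with the two pieces joining smoothly at `beta`. A small sketch of the same piecewise formula in Python (beta > 0 assumed, matching the helper's default of 1):

```python
import torch

def smooth_l1(input, target, beta=1.0):
    # quadratic region scaled by 1/beta so the pieces meet at |x - y| == beta
    t = torch.abs(input - target)
    return torch.where(t < beta, 0.5 * t ** 2 / beta, t - 0.5 * beta)

x, y = torch.randn(10), torch.randn(10)
print(smooth_l1(x, y, beta=0.5).mean())   # 'mean' reduction by hand
```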
{ + return detail::smooth_l1_loss(input, target, options.reduction(), beta); } // ============================================================================ @@ -525,6 +527,85 @@ inline Tensor triplet_margin_loss( // ============================================================================ +#ifndef DOXYGEN_SHOULD_SKIP_THIS +namespace detail { +inline Tensor triplet_margin_with_distance_loss( + const Tensor& anchor, + const Tensor& positive, + const Tensor& negative, + c10::optional distance_function, + double margin, + bool swap, + TripletMarginWithDistanceLossFuncOptions::reduction_t reduction) { + Tensor dist_pos, dist_neg; + if (distance_function.has_value()) { + auto distance_function_impl = distance_function.value(); + dist_pos = distance_function_impl(anchor, positive); + dist_neg = distance_function_impl(anchor, negative); + } else { + dist_pos = pairwise_distance(anchor, positive); + dist_neg = pairwise_distance(anchor, negative); + } + + if (swap) { + Tensor dist_swap; + if (distance_function.has_value()) { + dist_swap = distance_function.value()(positive, negative); + } else { + dist_swap = pairwise_distance(positive, negative); + } + dist_neg = torch::min(dist_neg, dist_swap); + } + + auto loss = torch::clamp_min(dist_pos - dist_neg + margin, 0); + + Tensor ret; + if (c10::get_if(&reduction)) { + ret = loss; + } else if (c10::get_if(&reduction)) { + ret = loss.mean(); + } else if (c10::get_if(&reduction)) { + ret = loss.sum(); + } else { + ret = anchor; + TORCH_INTERNAL_ASSERT( + false, + enumtype::get_enum_name(reduction), + " is not valid"); + } + return ret; +} +} // namespace detail +#endif /* DOXYGEN_SHOULD_SKIP_THIS */ + +/// See https://pytorch.org/docs/master/nn.functional.html#torch.nn.functional.triplet_margin_with_distance_loss +/// about the exact behavior of this functional. +/// +/// See the documentation for `torch::nn::functional::TripletMarginWithDistanceLossFuncOptions` class to learn what +/// optional arguments are supported for this functional. +/// +/// Example: +/// ``` +/// namespace F = torch::nn::functional; +/// F::triplet_margin_with_distance_loss(anchor, positive, negative, F::TripletMarginWithDistanceLossFuncOptions().margin(1.0)); +/// ``` +inline Tensor triplet_margin_with_distance_loss( + const Tensor& anchor, + const Tensor& positive, + const Tensor& negative, + const TripletMarginWithDistanceLossFuncOptions& options = {}) { + return detail::triplet_margin_with_distance_loss( + anchor, + positive, + negative, + options.distance_function(), + options.margin(), + options.swap(), + options.reduction()); +} + +// ============================================================================ + #ifndef DOXYGEN_SHOULD_SKIP_THIS namespace detail { inline Tensor ctc_loss(const Tensor& log_probs, diff --git a/torch/csrc/api/include/torch/nn/modules/loss.h b/torch/csrc/api/include/torch/nn/modules/loss.h index d136f9cb7ee9..8c9308864842 100644 --- a/torch/csrc/api/include/torch/nn/modules/loss.h +++ b/torch/csrc/api/include/torch/nn/modules/loss.h @@ -309,7 +309,7 @@ struct TORCH_API SmoothL1LossImpl : public Cloneable { TORCH_MODULE(SmoothL1Loss); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MultiLabelMarginLoss ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - + /// Creates a criterion that optimizes a multi-class multi-classification /// hinge loss (margin-based loss) between input :math:`x` (a 2D mini-batch `Tensor`) /// and output :math:`y` (which is a 2D `Tensor` of target class indices). 
@@ -421,9 +421,9 @@ TORCH_MODULE(MultiLabelSoftMarginLoss); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ TripletMarginLoss ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /// Creates a criterion that measures the triplet loss given an input -/// tensors :math:`x1`, :math:`x2`, :math:`x3` and a margin with a value greater +/// tensors :math:`x1`, :math:`x2`, :math:`x3` and a margin with a value greater /// than :math:`0`. This is used for measuring a relative similarity between -/// samples. A triplet is composed by `a`, `p` and `n` (i.e., `anchor`, +/// samples. A triplet is composed by `a`, `p` and `n` (i.e., `anchor`, /// `positive examples` and `negative examples` respectively). The /// shapes of all input tensors should be :math:`(N, D)`. /// See https://pytorch.org/docs/master/nn.html#torch.nn.TripletMarginLoss to learn @@ -461,6 +461,50 @@ struct TORCH_API TripletMarginLossImpl : public Cloneable /// module storage semantics. TORCH_MODULE(TripletMarginLoss); +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ TripletMarginWithDistanceLoss ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +/// Creates a criterion that measures the triplet loss given input +/// tensors :math:`a`, :math:`p`, and :math:`n` (representing anchor, +/// positive, and negative examples, respectively); and a nonnegative, real-valued function +/// ("distance function") used to compute the relationships between the anchor +/// and positive example ("positive distance") and the anchor and negative +/// example ("negative distance"). +/// See https://pytorch.org/docs/master/nn.html#torch.nn.TripletMarginWithDistanceLoss to learn +/// about the exact behavior of this module. +/// +/// See the documentation for `torch::nn::TripletMarginWithDistanceLossOptions` class to learn what +/// constructor arguments are supported for this module. +/// +/// Example: +/// ``` +/// TripletMarginWithDistanceLoss model(TripletMarginWithDistanceLossOptions().margin(3).swap(false)); +/// ``` +struct TORCH_API TripletMarginWithDistanceLossImpl : public Cloneable { + explicit TripletMarginWithDistanceLossImpl( + TripletMarginWithDistanceLossOptions options_ = {}); + + void reset() override; + + /// Pretty prints the `TripletMarginWithDistanceLoss` module into the given `stream`. + void pretty_print(std::ostream& stream) const override; + + Tensor forward( + const Tensor& anchor, + const Tensor& positive, + const Tensor& negative); + + /// The options with which this `Module` was constructed. + TripletMarginWithDistanceLossOptions options; +}; + +/// A `ModuleHolder` subclass for `TripletMarginWithDistanceLossImpl`. +/// See the documentation for `TripletMarginWithDistanceLossImpl` class to learn what methods it +/// provides, and examples of how to use `TripletMarginWithDistanceLoss` with +/// `torch::nn::TripletMarginWithDistanceLossOptions`. +/// See the documentation for `ModuleHolder` to learn about PyTorch's +/// module storage semantics. +TORCH_MODULE(TripletMarginWithDistanceLoss); + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CTCLoss ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /// The Connectionist Temporal Classification loss. @@ -626,9 +670,9 @@ TORCH_MODULE(NLLLoss); struct TORCH_API CrossEntropyLossImpl : public Cloneable { explicit CrossEntropyLossImpl( const CrossEntropyLossOptions& options_ = {}); - + void reset() override; - + /// Pretty prints the `CrossEntropyLoss` module into the given `stream`. 
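For readers following the new `TripletMarginWithDistanceLoss` pieces, here is the computation performed by the `detail::triplet_margin_with_distance_loss` body above, sketched in Python with `F.pairwise_distance` as the default distance, as the C++ code uses:

```python
import torch
import torch.nn.functional as F

def triplet_margin_with_distance_loss(anchor, positive, negative,
                                      distance_function=None, margin=1.0,
                                      swap=False, reduction="mean"):
    d = distance_function if distance_function is not None else F.pairwise_distance
    dist_pos = d(anchor, positive)
    dist_neg = d(anchor, negative)
    if swap:
        # use the smaller of d(anchor, negative) and d(positive, negative)
        dist_neg = torch.min(dist_neg, d(positive, negative))
    loss = torch.clamp_min(dist_pos - dist_neg + margin, 0)
    if reduction == "mean":
        return loss.mean()
    if reduction == "sum":
        return loss.sum()
    return loss   # 'none'

a, p, n = (torch.randn(8, 16) for _ in range(3))
print(triplet_margin_with_distance_loss(a, p, n, margin=1.0, swap=True))
```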
void pretty_print(std::ostream& stream) const override; diff --git a/torch/csrc/api/include/torch/nn/modules/transformercoder.h b/torch/csrc/api/include/torch/nn/modules/transformercoder.h index 04518a177333..6b69f53ecf33 100644 --- a/torch/csrc/api/include/torch/nn/modules/transformercoder.h +++ b/torch/csrc/api/include/torch/nn/modules/transformercoder.h @@ -32,6 +32,8 @@ namespace nn { class TORCH_API TransformerEncoderImpl : public Cloneable { public: + TransformerEncoderImpl(TransformerEncoderLayer encoder_layer, int64_t num_layers) + : TransformerEncoderImpl(TransformerEncoderOptions(encoder_layer, num_layers)) {} explicit TransformerEncoderImpl(TransformerEncoderOptions options_); Tensor forward( diff --git a/torch/csrc/api/include/torch/nn/options/loss.h b/torch/csrc/api/include/torch/nn/options/loss.h index 16cdd02aa562..e175aa02294a 100644 --- a/torch/csrc/api/include/torch/nn/options/loss.h +++ b/torch/csrc/api/include/torch/nn/options/loss.h @@ -388,6 +388,51 @@ using TripletMarginLossFuncOptions = TripletMarginLossOptions; // ============================================================================ +/// Options for the `TripletMarginWithDistanceLoss` module. +/// +/// Example: +/// ``` +/// TripletMarginWithDistanceLoss model(TripletMarginWithDistanceLossOptions().margin(3).swap(false)); +/// ``` +struct TORCH_API TripletMarginWithDistanceLossOptions { + typedef c10::variant reduction_t; + typedef std::function distance_function_t; + + /// Specifies a nonnegative, real-valued function that quantifies the + /// closeness of two tensors. If not specified, `F::pairwise_distance` will + /// be used. Default: nullopt + TORCH_ARG(c10::optional, distance_function) = c10::nullopt; + /// Specifies a nonnegative margin representing the minimum difference + /// between the positive and negative distances required for the loss to be 0. + /// Larger margins penalize cases where the negative examples are not distance + /// enough from the anchors, relative to the positives. Default: 1 + TORCH_ARG(double, margin) = 1.0; + /// Whether to use the distance swap described in the paper Learning shallow + /// convolutional feature descriptors with triplet losses by V. Balntas, + /// E. Riba et al. If True, and if the positive example is closer to the + /// negative example than the anchor is, swaps the positive example and the + /// anchor in the loss computation. Default: False + TORCH_ARG(bool, swap) = false; + /// Specifies the reduction to apply to the output. Default: Mean + TORCH_ARG(reduction_t, reduction) = torch::kMean; +}; + +namespace functional { +/// Options for `torch::nn::functional::triplet_margin_with_distance_loss`. +/// +/// See the documentation for `torch::nn::TripletMarginWithDistanceLossOptions` class to learn what +/// arguments are supported. +/// +/// Example: +/// ``` +/// namespace F = torch::nn::functional; +/// F::triplet_margin_with_distance_loss(anchor, positive, negative, F::TripletMarginWithDistanceLossFuncOptions().margin(1.0)); +/// ``` +using TripletMarginWithDistanceLossFuncOptions = TripletMarginWithDistanceLossOptions; +} // namespace functional + +// ============================================================================ + /// Options for the `CTCLoss` module. 
/// /// Example: diff --git a/torch/csrc/api/src/nn/modules/loss.cpp b/torch/csrc/api/src/nn/modules/loss.cpp index 43ab1119def9..4b41b88c420c 100644 --- a/torch/csrc/api/src/nn/modules/loss.cpp +++ b/torch/csrc/api/src/nn/modules/loss.cpp @@ -180,6 +180,33 @@ Tensor TripletMarginLossImpl::forward( // ============================================================================ +TripletMarginWithDistanceLossImpl::TripletMarginWithDistanceLossImpl( + TripletMarginWithDistanceLossOptions options_) + : options(std::move(options_)) {} + +void TripletMarginWithDistanceLossImpl::reset() {} + +void TripletMarginWithDistanceLossImpl::pretty_print(std::ostream& stream) const { + stream << "torch::nn::TripletMarginWithDistanceLoss(margin=" << options.margin() + << std::boolalpha << ", swap=" << options.swap() << ")"; +} + +Tensor TripletMarginWithDistanceLossImpl::forward( + const Tensor& anchor, + const Tensor& positive, + const Tensor& negative) { + return F::detail::triplet_margin_with_distance_loss( + anchor, + positive, + negative, + options.distance_function(), + options.margin(), + options.swap(), + options.reduction()); +} + +// ============================================================================ + MultiLabelMarginLossImpl::MultiLabelMarginLossImpl( const torch::nn::MultiLabelMarginLossOptions& options_) : options(options_) {} @@ -223,9 +250,9 @@ void SmoothL1LossImpl::pretty_print(std::ostream& stream) const { Tensor SmoothL1LossImpl::forward(const Tensor& input, const Tensor& target) { return F::detail::smooth_l1_loss(input, target, options.reduction()); } - + // ============================================================================ - + CTCLossImpl::CTCLossImpl(const CTCLossOptions& options_) : options(options_) {} void CTCLossImpl::reset() {} diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 29f0720fb3c7..1314a98e9562 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -87,6 +87,14 @@ static Tensor wrapped_scalar_tensor(Scalar scalar) { return tensor; } +Tensor handle_r_to_c(ScalarType self_st, Tensor gradient_result) { + if (!at::isComplexType(self_st) && gradient_result.is_complex()) { + // R -> C + return at::real(gradient_result); + } + return gradient_result; +} + Tensor restore_reduced_dims(const Tensor &output, IntArrayRef dims, bool keepdim) { if (keepdim) { return output; @@ -211,6 +219,17 @@ Tensor mvlgamma_backward(Tensor grad, const Tensor & self, int64_t p) { return grad * args.digamma_().sum(-1); } +Tensor sgn_backward(Tensor result, Tensor grad, Tensor self) { + if (self.is_complex()) { + auto abs = at::abs(self); + // C -> C + // https://arxiv.org/pdf/1701.00392.pdf Section 4.20 + return at::where(abs == 0.0, at::zeros({}, grad.options()), (grad/abs - (at::real(grad/self) * result))); + } else { + return at::zeros_like(grad, at::MemoryFormat::Preserve); + } +} + Tensor mul_tensor_backward(Tensor grad, Tensor other, ScalarType self_st) { auto result = grad * other.conj(); if (!at::isComplexType(self_st) && result.is_complex()) { @@ -220,6 +239,24 @@ Tensor mul_tensor_backward(Tensor grad, Tensor other, ScalarType self_st) { return result; } +Tensor div_tensor_self_backward(Tensor grad, Tensor other, ScalarType self_st) { + auto result = grad / other.conj(); + if (!at::isComplexType(self_st) && result.is_complex()) { + // R -> C + result = at::real(result); + } + return result; +} + +Tensor div_tensor_other_backward(Tensor grad, Tensor self, Tensor other) { + 
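Written out, the complex branch of the new `sgn_backward` above computes, for `s = sgn(z) = z/|z|` and incoming gradient `g`:

```latex
\bar{z} =
\begin{cases}
  \dfrac{g}{|z|} - \operatorname{Re}\!\left(\dfrac{g}{z}\right)\operatorname{sgn}(z), & z \neq 0,\\[4pt]
  0, & z = 0,
\end{cases}
```

which is exactly `grad/abs - real(grad/self) * result` guarded by the `abs == 0` check; for real inputs the gradient is identically zero, as the `else` branch returns `zeros_like(grad)`.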
auto result = -grad * ((self / other) / other).conj(); + if (!other.is_complex() && result.is_complex()) { + // R -> C + result = at::real(result); + } + return result; +} + Tensor permute_backwards(const Tensor & grad, IntArrayRef fwd_dims) { // invert the permutation auto ndims = fwd_dims.size(); @@ -563,7 +600,12 @@ Tensor sum_tensorlist(TensorList tl) { return sum; } -Tensor repeat_backward(Tensor grad, int64_t input_dims, IntArrayRef repeats) { +Tensor repeat_backward(Tensor grad, IntArrayRef repeats, IntArrayRef input_shape) { + auto find_iter = std::find(repeats.cbegin(), repeats.cend(), 0); + if (find_iter != repeats.cend()) { + return at::zeros(input_shape, grad.options()); + } + const auto input_dims = input_shape.size(); int64_t num_unsqueezed = grad.dim() - input_dims; for (int64_t i = 0; i < num_unsqueezed; ++i) { grad = grad.sum(0, false); @@ -590,13 +632,12 @@ Tensor _fused_dropout_backward(Tensor grad, Tensor mask, double p1m) { } Tensor evenly_distribute_backward(Tensor grad, const Tensor & input, const Tensor & value) { - auto mask = (input == value); - auto count = mask.sum(); - auto grad_input = grad / count; if (input.is_cuda()) { - return mask * grad_input; + auto mask = (input == value).logical_or_(input.isnan().logical_and_(value.isnan())); + return mask * (grad / mask.sum()); } else { - return at::zeros_like(input).masked_fill_(mask, grad_input); + auto mask = value.isnan().item() ? input.isnan() : input == value; + return at::zeros_like(input).masked_fill_(mask, grad / mask.sum()); } } @@ -615,11 +656,11 @@ Tensor var_backward(Tensor grad, const Tensor & self, IntArrayRef dim, bool unbi } Tensor std_backward(const Tensor & result, const Tensor & grad, const Tensor & self, bool unbiased) { - return var_backward(grad / (result * 2), self, unbiased); + return var_backward((grad / (result * 2)).masked_fill_(result == 0, 0), self, unbiased); } Tensor std_backward(const Tensor & result, Tensor grad, const Tensor & self, IntArrayRef dim, bool unbiased, bool keepdim) { - return var_backward(grad / (result * 2), self, dim, unbiased, keepdim); + return var_backward((grad / (result * 2)).masked_fill_(result == 0, 0), self, dim, unbiased, keepdim); } Tensor mean_backward(Tensor grad, const IntArrayRef sizes, IntArrayRef dim, bool keepdim) { @@ -682,15 +723,15 @@ Tensor cholesky_backward(Tensor grad, bool upper, Tensor L) { // leads to stable gradient updates, and retains symmetry of the updated matrix if it // were updated by a gradient based algorithm. 
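The zero-repeat special case added to `repeat_backward` above makes the gradient of an empty repeat well-defined; a minimal sketch, assuming a standard libtorch build:

```
#include <iostream>
#include <torch/torch.h>

int main() {
  auto x = torch::randn({2, 3}, torch::requires_grad());
  // A zero in `repeats` yields an empty output; with the early return above,
  // the gradient w.r.t. `x` is simply zeros with `x`'s shape.
  auto y = x.repeat({0, 1});
  y.sum().backward();
  std::cout << x.grad().sizes() << "\n";  // [2, 3]
  return 0;
}
```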
if (upper) { - L = L.transpose(-1, -2); - grad = grad.transpose(-1, -2); + L = L.transpose(-1, -2).conj(); + grad = grad.transpose(-1, -2).conj(); } auto L_inverse = std::get<0>(at::triangular_solve(at::eye(L.size(-1), L.options()), L, /*upper=*/false)); - auto phi = at::matmul(L.transpose(-1, -2), grad); + auto phi = at::matmul(L.transpose(-1, -2).conj(), grad); phi.tril_().diagonal(/*offset=*/0, /*dim1=*/-2, /*dim2=*/-1).mul_(0.5); - auto grad_input = at::matmul(at::matmul(L_inverse.transpose(-1, -2), phi), L_inverse); - return grad_input.add(grad_input.transpose(-1, -2)).mul_(0.5); // Symmetrizing the gradient + auto grad_input = at::matmul(at::matmul(L_inverse.transpose(-1, -2).conj(), phi), L_inverse); + return grad_input.add(grad_input.transpose(-1, -2).conj()).mul_(0.5); // Symmetrizing the gradient } Tensor cholesky_inverse_backward(Tensor grad, Tensor L, bool upper, Tensor inverse) { @@ -923,20 +964,24 @@ Tensor l1_loss_double_backward_grad_output(const Tensor & grad, const Tensor & i return output; } -Tensor smooth_l1_loss_double_backward(const Tensor & grad, const Tensor & input, const Tensor & target, int64_t reduction) { +Tensor smooth_l1_loss_double_backward(const Tensor & grad, const Tensor & input, const Tensor & target, int64_t reduction, double beta) { + // special case to protect against a divide-by-zero. + if (beta == 0) { + return at::zeros(grad.sizes(), grad.options()); + } auto d = (input - target).abs(); - auto grad_input = grad * (d < 1).type_as(grad); + auto grad_input = grad * (d < beta).type_as(grad) / beta; if (reduction == at::Reduction::Mean) { grad_input /= input.numel(); } return grad_input; } -Tensor smooth_l1_loss_double_backward_grad_output(const Tensor & grad, const Tensor & grad_output, const Tensor & input, const Tensor & target, int64_t reduction) { +Tensor smooth_l1_loss_double_backward_grad_output(const Tensor & grad, const Tensor & grad_output, const Tensor & input, const Tensor & target, int64_t reduction, double beta) { if (reduction == at::Reduction::None) { - return smooth_l1_loss_backward(grad, input, target, reduction); + return smooth_l1_loss_backward(grad, input, target, reduction, beta); } - auto r = smooth_l1_loss_backward(ones_like(grad_output), input, target, reduction); + auto r = smooth_l1_loss_backward(ones_like(grad_output), input, target, reduction, beta); return (r * grad).sum(); } diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h index b4e7d1667f88..00171cbbf656 100644 --- a/torch/csrc/autograd/FunctionsManual.h +++ b/torch/csrc/autograd/FunctionsManual.h @@ -33,6 +33,7 @@ bool any_variable_defined(variable_list& variables); void copy_range(variable_list& out, IndexRange range, const at::Tensor & t); void copy_range(variable_list& out, IndexRange range, at::ArrayRef t); at::Tensor not_implemented(const char* name); +at::Tensor handle_r_to_c(ScalarType self_st, Tensor gradient_result); at::Tensor maybe_multiply(const at::Tensor & t, const at::Scalar & s); int64_t _safe_size(IntArrayRef sizes, IntArrayRef dim); Tensor restore_reduced_dims(const Tensor &output, IntArrayRef dims, bool keepdim); @@ -44,6 +45,8 @@ at::Tensor pow_backward_self(at::Tensor grad, const at::Tensor & self, const at: at::Tensor pow_backward_exponent(at::Tensor grad, const at::Tensor& self, const at::Tensor& exponent, at::Tensor result); at::Tensor pow_backward_exponent(at::Tensor grad, const at::Scalar & base, const at::Tensor& exponent, at::Tensor result); at::Tensor mul_tensor_backward(Tensor grad, Tensor other, 
ScalarType self_st); +at::Tensor div_tensor_self_backward(Tensor grad, Tensor other, ScalarType self_st); +at::Tensor div_tensor_other_backward(Tensor grad, Tensor self, Tensor other); at::Tensor mvlgamma_backward(at::Tensor grad, const at::Tensor & self, int64_t p); at::Tensor permute_backwards(const at::Tensor & grad, at::IntArrayRef fwd_dims); at::Tensor rad2deg_backward(const at::Tensor& grad); @@ -71,9 +74,10 @@ at::Tensor mm_mat2_backward(const at::Tensor & grad, const at::Tensor & mat1, at at::Tensor _sparse_addmm_sparse_backward(const at::Tensor& grad, const at::Tensor& sparse_, const at::Tensor& dense, const at::Scalar& alpha); at::Tensor renorm_backward(const at::Tensor & grad, const at::Tensor & self, at::Scalar p, int64_t dim, at::Scalar maxnorm); at::Tensor sum_tensorlist(at::TensorList tl); -at::Tensor repeat_backward(at::Tensor grad, int64_t input_dims, at::IntArrayRef repeats); +at::Tensor repeat_backward(at::Tensor grad, at::IntArrayRef repeats, at::IntArrayRef input_shape); at::Tensor _fused_dropout_backward(at::Tensor grad, at::Tensor mask, double p1m); at::Tensor evenly_distribute_backward(at::Tensor grad, const at::Tensor & input, const at::Tensor & value); +at::Tensor sgn_backward(Tensor result, Tensor grad, Tensor self); at::Tensor var_backward(const at::Tensor & grad, const at::Tensor & self, bool unbiased); at::Tensor var_backward(at::Tensor grad, const at::Tensor & self, at::IntArrayRef dim, bool unbiased, bool keepdim); at::Tensor std_backward(const at::Tensor & result, const at::Tensor & grad, const at::Tensor & self, bool unbiased); @@ -101,8 +105,8 @@ at::Tensor log_softmax_double_backward(const at::Tensor & grad, const at::Tensor at::Tensor binary_cross_entropy_double_backward(const at::Tensor & grad_output, const at::Tensor & grad, const at::Tensor & input, const at::Tensor & target, const c10::optional& weight, int64_t reduction); at::Tensor binary_cross_entropy_double_backward_grad_output(const at::Tensor & grad, const at::Tensor & input, const at::Tensor & target, const c10::optional& weight, int64_t reduction); at::Tensor l1_loss_double_backward_grad_output(const at::Tensor & grad, const at::Tensor & input, const at::Tensor & target, int64_t reduction); -at::Tensor smooth_l1_loss_double_backward(const at::Tensor & grad, const at::Tensor & input, const at::Tensor & target, int64_t reduction); -at::Tensor smooth_l1_loss_double_backward_grad_output(const at::Tensor & grad, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & target, int64_t reduction); +at::Tensor smooth_l1_loss_double_backward(const at::Tensor & grad, const at::Tensor & input, const at::Tensor & target, int64_t reduction, double beta); +at::Tensor smooth_l1_loss_double_backward_grad_output(const at::Tensor & grad, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & target, int64_t reduction, double beta); at::Tensor mse_loss_double_backward(const at::Tensor & grad, const at::Tensor & input, int64_t reduction); at::Tensor mse_loss_double_backward_grad_output(const at::Tensor & grad, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & target, int64_t reduction); at::Tensor soft_margin_loss_double_backward(const at::Tensor & grad, const at::Tensor & input, const at::Tensor & target, int64_t reduction); diff --git a/torch/csrc/autograd/VariableTypeManual.cpp b/torch/csrc/autograd/VariableTypeManual.cpp index c72c67eb5230..9dfc4573188a 100644 --- a/torch/csrc/autograd/VariableTypeManual.cpp +++ 
b/torch/csrc/autograd/VariableTypeManual.cpp @@ -93,7 +93,7 @@ void backward( torch::autograd::backward({self}, {_gradient}, std::move(keep_graph), create_graph); } -void set_data(const Tensor & self, const Tensor & new_data) { +void set_data(Tensor & self, const Tensor & new_data) { // `var.set_data(new_data)` shallow-copies all non-autograd TensorImpl fields // from `new_data` to `var`. It requires that `new_data` and `var` have compatible // tensor type. @@ -160,7 +160,7 @@ Tensor& requires_grad_(Tensor& self, bool _requires_grad) { return self.set_requires_grad(_requires_grad); } -void retain_grad(const Tensor & self) { +void retain_grad(Tensor & self) { TORCH_CHECK(self.requires_grad(), "can't retain_grad on Tensor that has requires_grad=False"); if (self.is_leaf()) { // no-op for leaves return; @@ -269,7 +269,12 @@ Tensor & detach_(Tensor & self) { "of detach_(). Alternatively, create this view with an " "`unsafe_` version of the function that produced it."); } else { - AT_ERROR("Can't detach views in-place. Use detach() instead"); + AT_ERROR("If you are using DistributedDataParallel (DDP) for training, " + "and gradient_as_bucket_view is set as True, gradients are " + "views of DDP buckets, and hence detach_() cannot be called " + "on these gradients. To fix this error, please refer to the " + "Optimizer.zero_grad() function in torch/optim/optimizer.py " + "as the solution."); } } // I think the choice here is conservative. In principle, doing diff --git a/torch/csrc/autograd/VariableTypeUtils.h b/torch/csrc/autograd/VariableTypeUtils.h index 692972533adc..2ef1415cc937 100644 --- a/torch/csrc/autograd/VariableTypeUtils.h +++ b/torch/csrc/autograd/VariableTypeUtils.h @@ -67,6 +67,19 @@ inline void throw_error_out_requires_grad(const char* name) { "but one of the arguments requires grad."); } +inline void throw_error_for_complex_autograd(const Tensor& tensor, const char* name) { + if (tensor.requires_grad()) { + TORCH_CHECK(!tensor.is_complex(), name, + " does not support automatic differentiation for outputs with complex dtype."); + } +} + +inline void throw_error_for_complex_autograd(const TensorList& tensorlist, const char* name) { + for (auto tensor: tensorlist) { + throw_error_for_complex_autograd(tensor, name); + } +} + // TODO: Blegh, bare references inline void rebase_history(Variable& var, std::shared_ptr grad_fn) { diff --git a/torch/csrc/autograd/autograd.cpp b/torch/csrc/autograd/autograd.cpp index ab02a03279a1..b8756ff1c7b4 100644 --- a/torch/csrc/autograd/autograd.cpp +++ b/torch/csrc/autograd/autograd.cpp @@ -75,10 +75,6 @@ variable_list run_backward( for (size_t i = 0; i < num_tensors; i++) { const Variable& output = outputs[i]; auto gradient_edge = impl::gradient_edge(output); - if(output.is_complex()) { - TORCH_WARN_ONCE("Complex backward is not fully supported yet and could lead to wrong ", - "gradients for functions we have not fixed yet"); - } TORCH_CHECK( gradient_edge.function, "element ", i, " of tensors does not require grad and does not have a grad_fn"); diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp index 62ca26e46939..e952b0afc772 100644 --- a/torch/csrc/autograd/engine.cpp +++ b/torch/csrc/autograd/engine.cpp @@ -227,7 +227,7 @@ Engine::~Engine() { // Do not wait for termination of global threads on Windows // Because CRT terminates DLL threads before calling // global object destructors -#if !defined(_WIN32) || !defined(C10_BUILD_SHARED_LIBS) +#if !defined(_WIN32) || defined(C10_USE_MSVC_STATIC_RUNTIME) std::unique_lock 
lk(non_reentrant_device_thread_mutex_); while(non_reentrant_device_thread_count_.load() != 0) { non_reentrant_device_thread_condvar_.wait(lk); } @@ -513,12 +513,10 @@ void GraphTask::exec_post_processing() { } void GraphTask::set_exception_without_signal(const std::shared_ptr<Node>& fn) { - std::unique_lock<std::mutex> lock(mutex_); - if (!has_error_.load()) { + if (!has_error_.exchange(true)) { if (AnomalyMode::is_enabled() && fn) { fn->metadata()->print_stack(fn->name()); } - has_error_ = true; } } diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index 17d4f5473880..65d94717a84b 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -114,6 +114,10 @@ struct TORCH_API Node : std::enable_shared_from_this<Node> { // We are tracking the parents to track multiple backward operations. assign_parent(); } + + if (profiler::profilerEnabled()) { + thread_id_ = at::RecordFunction::currentThreadId(); + } } explicit Node(edge_list&& next_edges = edge_list()) @@ -129,8 +133,21 @@ struct TORCH_API Node : std::enable_shared_from_this<Node> { /// Evaluates the function on the given inputs and returns the result of the /// function call. variable_list operator()(variable_list&& inputs) { - RECORD_FUNCTION( - name(), std::vector<c10::IValue>(inputs.begin(), inputs.end()), sequence_nr()); + // Using RecordFunction to trigger observers in the backward pass + at::RecordFunction guard(at::RecordScope::BACKWARD_FUNCTION); + if (guard.active) { + // Using sequence number and thread id to correlate with + // the forward pass function + guard.setForwardThreadId(thread_id_); + if (guard.needs_inputs) { + guard.before( + name(), + std::vector<c10::IValue>(inputs.begin(), inputs.end()), + sequence_nr()); + } else { + guard.before(name(), sequence_nr()); + } + } // In the first iteration of named tensors, autograd ignores names and // operates on unnamed tensors. In the long term, autograd should // probably operate with names. @@ -241,6 +258,11 @@ struct TORCH_API Node : std::enable_shared_from_this<Node> { // assigning a node as a parent to this node void assign_parent(); + /// Id of the thread that created Node + uint64_t thread_id() const noexcept { + return thread_id_; + } + /// Returns the name of the dynamic type of the function, for debugging. virtual std::string name() const; @@ -362,6 +384,9 @@ struct TORCH_API Node : std::enable_shared_from_this<Node> { // fields. const uint64_t sequence_nr_; + // Id of the thread that created the instance + uint64_t thread_id_ = 0; + // Note [Thread Safety on Autograd Node] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Autograd Engine let the owning thread which calls Engine::execute to drive the diff --git a/torch/csrc/autograd/functions/accumulate_grad.h b/torch/csrc/autograd/functions/accumulate_grad.h index e1a02dc19fd8..dafd07f64b84 100644 --- a/torch/csrc/autograd/functions/accumulate_grad.h +++ b/torch/csrc/autograd/functions/accumulate_grad.h @@ -161,6 +161,11 @@ struct TORCH_API AccumulateGrad : public Node { // valid operation which adds `new_grad` to `variable_grad` in // place. `variable_grad` is thus still referring to the same tensor // after the operation. + // Also DistributedDataParallel(DDP) package relies on grad being + // mutated in place for saving peak memory usage. DDP will still + // work correctly if it is mutated out of place here, but DDP will + // maintain one extra copy of grad tensors in buffer and thus + // increase peak memory usage.
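The `set_exception_without_signal` change above replaces a mutex-protected check-then-set with a single atomic exchange; a standalone sketch of that pattern (plain C++, independent of the autograd engine):

```
#include <atomic>
#include <iostream>

std::atomic<bool> has_error{false};

// exchange(true) returns the previous value, so exactly one caller sees
// `false` and performs the one-time error handling; no lock is needed.
void record_error_once() {
  if (!has_error.exchange(true)) {
    std::cout << "first error recorded\n";  // e.g. print the anomaly stack here
  }
}

int main() {
  record_error_once();
  record_error_once();  // no-op: the flag was already set
  return 0;
}
```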
variable_grad += new_grad; CHECK_RESULT(variable_grad, variable); // ^ We could enforce the contract more aggressively here by writing: diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index c92654cf7815..045a732a2016 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -42,26 +42,35 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { .value("NVTX", ProfilerState::NVTX); py::class_(m, "ProfilerConfig") - .def(py::init()); + .def(py::init()); py::class_(m, "ProfilerEvent") .def("kind", &Event::kind) .def("name", [](const Event& e) { return e.name(); }) - .def("thread_id", &Event::thread_id) + .def("thread_id", &Event::threadId) + .def("fwd_thread_id", &Event::fwdThreadId) .def("device", &Event::device) - .def("cpu_elapsed_us", &Event::cpu_elapsed_us) - .def("cuda_elapsed_us", &Event::cuda_elapsed_us) - .def("has_cuda", &Event::has_cuda) + .def("cpu_elapsed_us", &Event::cpuElapsedUs) + .def("cuda_elapsed_us", &Event::cudaElapsedUs) + .def("has_cuda", &Event::hasCuda) .def("shapes", &Event::shapes) - .def("cpu_memory_usage", &Event::cpu_memory_usage) - .def("cuda_memory_usage", &Event::cuda_memory_usage) + .def("cpu_memory_usage", &Event::cpuMemoryUsage) + .def("cuda_memory_usage", &Event::cudaMemoryUsage) .def("handle", &Event::handle) - .def("node_id", &Event::node_id) + .def("node_id", &Event::nodeId) .def("is_remote", &Event::isRemote) - .def("sequence_nr", &Event::sequence_nr); + .def("sequence_nr", &Event::sequenceNr) + .def("stack", &Event::stack) + .def("scope", &Event::scope); + + py::class_(m, "_ProfilerDisableOptions") + .def(py::init()); m.def("_enable_profiler", enableProfiler); - m.def("_disable_profiler", disableProfiler); + m.def( + "_disable_profiler", + disableProfiler, + py::arg("profiler_disable_options") = ProfilerDisableOptions()); m.def("_profiler_enabled", profilerEnabled); m.def("_enable_record_function", [](bool enable) { at::enableRecordFunction(enable); diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 049f857f8bbf..5cbb7606e579 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -24,27 +25,46 @@ namespace torch { namespace autograd { namespace profiler { namespace { - enum EventIValueIdx { - KIND = 0, - NAME, - THREAD_ID, - HANDLE, - NODE_ID, - CPU_MEM_USAGE, - CPU_NS, - CUDA_RECORDED, - CUDA_MEM_USAGE, - CUDA_DEVICE, - CUDA_US, - NUM_EVENT_IVALUE_IDX // must be last in list - }; +enum EventIValueIdx { + KIND = 0, + NAME, + THREAD_ID, + HANDLE, + NODE_ID, + CPU_MEM_USAGE, + CPU_NS, + CUDA_RECORDED, + CUDA_MEM_USAGE, + CUDA_DEVICE, + CUDA_US, + SHAPES, + NUM_EVENT_IVALUE_IDX // must be last in list +}; - enum ProfilerIValueIdx { - STATE = 0, - REPORT_INPUT_SHAPES, - PROFILE_MEMORY, - NUM_PROFILER_CFG_IVALUE_IDX // must be last in list - }; +enum ProfilerIValueIdx { + STATE = 0, + REPORT_INPUT_SHAPES, + PROFILE_MEMORY, + NUM_PROFILER_CFG_IVALUE_IDX // must be last in list +}; + + const std::unordered_set disable_cuda_profiling = { + "aten::view", + "aten::t", + "aten::transpose", + "aten::stride", + "aten::empty", + "aten::empty_like", + "aten::empty_strided", + "aten::as_strided", + "aten::expand", + "aten::resize_", + "aten::squeeze", + "aten::unsqueeze", + "aten::slice", + "aten::_unsafe_view", + "aten::size" + }; CUDAStubs default_stubs; constexpr CUDAStubs* default_stubs_addr = &default_stubs; @@ -116,8 +136,9 @@ static CUDAStubs* cuda_stubs = 
default_stubs_addr; // - TorchScript functions/methods // - user defined named ranges (see `record_function` python context manager) // -// Profiler setups a pair of callbacks that record profiling events and save them -// into the thread local profiler struct (ThreadLocalDebugInfo, PROFILER_STATE slot) +// Profiler setups a pair of callbacks that record profiling events and save +// them into the thread local profiler struct (ThreadLocalDebugInfo, +// PROFILER_STATE slot) // // // Thus, the overall logic is: @@ -142,12 +163,16 @@ static CUDAStubs* cuda_stubs = default_stubs_addr; // - save profiling events into the profiling state // +struct FileLineFunc { + std::string filename; + size_t line; + std::string funcname; +}; + // Profiler state -struct ProfilerThreadLocalState - : public c10::MemoryReportingInfoBase { - explicit ProfilerThreadLocalState( - const ProfilerConfig& config) - : config_(config), remoteProfiledEvents_{c10::nullopt} {} +struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { + explicit ProfilerThreadLocalState(const ProfilerConfig& config) + : config_(config), remoteProfiledEvents_{c10::nullopt} {} ~ProfilerThreadLocalState() override = default; inline const ProfilerConfig& config() const { @@ -171,9 +196,7 @@ struct ProfilerThreadLocalState return result; } - void mark( - std::string name, - bool include_cuda = true) { + void mark(std::string name, bool include_cuda = true) { if (config_.state == ProfilerState::Disabled) { return; } @@ -181,17 +204,17 @@ struct ProfilerThreadLocalState cuda_stubs->nvtxMarkA(name.c_str()); } else { Event evt( - EventKind::Mark, - at::StringView(std::move(name)), - at::RecordFunction::currentThreadId(), - include_cuda && config_.state == ProfilerState::CUDA - ); + EventKind::Mark, + at::StringView(std::move(name)), + at::RecordFunction::currentThreadId(), + include_cuda && config_.state == ProfilerState::CUDA); evt.setNodeId(at::RecordFunction::getDefaultNodeId()); getEventList().record(std::move(evt)); } } - void setOrAddRemoteProfiledEvents(std::vector&& remoteProfiledEvents) { + void setOrAddRemoteProfiledEvents( + std::vector&& remoteProfiledEvents) { // Lock to serialize access from multiple callback threads. 
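A reduced sketch of the deny-list check the profiling callbacks below apply before recording CUDA events, so the cheap view/metadata ops in the `disable_cuda_profiling` set above are timed on CPU only; the helper name here is illustrative:

```
#include <string>
#include <unordered_set>

static const std::unordered_set<std::string> kSkipCudaTiming = {
    "aten::view", "aten::t", "aten::size"};  // abbreviated list

// CUDA event timing is skipped for deny-listed ops even in CUDA profiling mode.
bool should_record_cuda(bool cuda_mode, const std::string& op_name) {
  return cuda_mode && kSkipCudaTiming.count(op_name) == 0;
}
```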
std::lock_guard guard(state_mutex_); if (remoteProfiledEvents_) { @@ -202,31 +225,44 @@ struct ProfilerThreadLocalState } void pushRange( - const at::StringView& name, + const at::RecordFunction& fn, + const bool record_cuda, const char* msg = "", - int64_t sequence_nr = -1, - std::vector>&& shapes = {}, - at::RecordFunctionHandle handle = 0) { + std::vector>&& shapes = {}) { if (config_.state == ProfilerState::Disabled) { return; } if (config_.state == ProfilerState::NVTX) { cuda_stubs->nvtxRangePushA(getNvtxStr( - name, msg, sequence_nr, shapes).c_str()); + fn.name(), msg, fn.seqNr(), shapes).c_str()); } else { - Event evt(EventKind::PushRange, - name, + Event evt( + EventKind::PushRange, + fn.name(), at::RecordFunction::currentThreadId(), - config_.state == ProfilerState::CUDA, - handle, + record_cuda, + fn.handle(), std::move(shapes), at::RecordFunction::getDefaultNodeId()); - evt.setSequenceNr(sequence_nr); + evt.setSequenceNr(fn.seqNr()); + evt.setFwdThreadId(fn.forwardThreadId()); + evt.setScope((uint8_t)fn.scope()); +#ifndef C10_MOBILE + // backward nodes source range corresponds to the forward node + // TODO: consider using C++ stack trace + if (config_.with_stack && fn.scope() != at::RecordScope::BACKWARD_FUNCTION) { + auto cs = prepareCallstack(jit::currentCallstack()); + if (cs.empty()) { + cs = prepareCallstack(jit::tracer::pythonCallstack()); + } + evt.setStack(callstackStr(cs)); + } +#endif getEventList().record(std::move(evt)); } } - void popRange(uint64_t thread_id, at::RecordFunctionHandle handle) { + void popRange(const at::RecordFunction& fn, const bool record_cuda) { if (config_.state == ProfilerState::Disabled) { return; } @@ -237,13 +273,14 @@ struct ProfilerThreadLocalState // called on a different thread than pushRange // As a convention, we put the async pop on the original // thread and save current thread id in pop event - Event evt(EventKind::PopRange, + Event evt( + EventKind::PopRange, at::StringView(""), at::RecordFunction::currentThreadId(), - config_.state == ProfilerState::CUDA, - handle); + record_cuda, + fn.handle()); evt.setNodeId(at::RecordFunction::getDefaultNodeId()); - getEventList(thread_id).record(std::move(evt)); + getEventList(fn.threadId()).record(std::move(evt)); } } @@ -256,7 +293,9 @@ struct ProfilerThreadLocalState } void reportMemoryUsage( - void* /* unused */, int64_t alloc_size, c10::Device device) override { + void* /* unused */, + int64_t alloc_size, + c10::Device device) override { if (config_.profile_memory && config_.state != ProfilerState::Disabled) { uint64_t thread_id = at::RecordFunction::currentThreadId(); Event evt( @@ -274,6 +313,34 @@ struct ProfilerThreadLocalState } private: + std::vector prepareCallstack(const std::vector& cs) { + std::vector entries; + entries.reserve(cs.size()); + for (const auto& entry : cs) { + auto& range = entry.range; + if (range.source()) { + auto& src = range.source(); + if (src && src->filename()) { + auto line = src->starting_line_no() + + src->lineno_for_offset(range.start()); + entries.emplace_back(FileLineFunc{*(src->filename()), line, entry.filename}); + } + } + } + return entries; + } + + std::vector callstackStr(const std::vector& cs) { + std::vector cs_str; + cs_str.reserve(cs.size()); + for (const auto& entry : cs) { + std::stringstream loc; + loc << entry.filename << "(" << entry.line << "): " << entry.funcname; + cs_str.push_back(loc.str()); + } + return cs_str; + } + std::string getNvtxStr( const at::StringView& name, const char* msg, @@ -281,8 +348,15 @@ struct 
ProfilerThreadLocalState const std::vector>& shapes) const { if (sequence_nr >= 0 || shapes.size() > 0) { std::stringstream s; +#ifdef __HIP_PLATFORM_HCC__ + s << name.str(); +#endif if (sequence_nr >= 0) { +#ifdef __HIP_PLATFORM_HCC__ + s << msg << sequence_nr; +#else s << name.str() << msg << sequence_nr; +#endif } if (shapes.size() > 0) { s << ", sizes = ["; @@ -332,7 +406,7 @@ struct ProfilerThreadLocalState std::unordered_map> event_lists_map_; - ProfilerConfig config_ = ProfilerConfig(ProfilerState::Disabled, false, false); + ProfilerConfig config_ = ProfilerConfig(ProfilerState::Disabled); at::CallbackHandle handle_ = 0; c10::optional>> remoteProfiledEvents_; }; @@ -351,6 +425,11 @@ void pushProfilingCallbacks() { if (!state_ptr || state_ptr->config().state == ProfilerState::Disabled) { return; } + bool record_cuda = + state_ptr->config().state == ProfilerState::CUDA; + if (record_cuda && disable_cuda_profiling.find(fn.name().str()) != disable_cuda_profiling.end()) { + record_cuda = false; + } auto* msg = (fn.seqNr() >= 0) ? ", seq = " : ""; if (state_ptr->config().report_input_shapes) { @@ -368,10 +447,9 @@ void pushProfilingCallbacks() { inputSizes.emplace_back(); } } - state_ptr->pushRange( - fn.name(), msg, fn.seqNr(), std::move(inputSizes), fn.handle()); + state_ptr->pushRange(fn, record_cuda, msg, std::move(inputSizes)); } else { - state_ptr->pushRange(fn.name(), msg, fn.seqNr(), {}, fn.handle()); + state_ptr->pushRange(fn, record_cuda, msg); } }, [](const at::RecordFunction& fn) { @@ -379,7 +457,12 @@ void pushProfilingCallbacks() { if (!state_ptr || state_ptr->config().state == ProfilerState::Disabled) { return; } - state_ptr->popRange(fn.getStartCallbacksThreadId(), fn.handle()); + bool record_cuda = + state_ptr->config().state == ProfilerState::CUDA; + if (record_cuda && disable_cuda_profiling.find(fn.name().str()) != disable_cuda_profiling.end()) { + record_cuda = false; + } + state_ptr->popRange(fn, record_cuda); }) .needsInputs(state_ptr->config().report_input_shapes) .needsIds(true)); @@ -388,9 +471,6 @@ void pushProfilingCallbacks() { const int kCUDAWarmupStart = 5; -// temp. workaround for dispatcher ::Profiler key -thread_local std::vector> g_; - } // namespace void registerCUDAMethods(CUDAStubs* stubs) { @@ -445,12 +525,10 @@ void enableProfiler(const ProfilerConfig& new_config) { auto state_ptr = getProfilerTLSState(); TORCH_CHECK(!state_ptr, "Profiler is already enabled on this thread"); - auto state = std::make_shared(new_config); c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state); pushProfilingCallbacks(); - g_.emplace_back(std::make_shared()); if (new_config.state == ProfilerState::CUDA) { // event recording appears to have some startup overhead, so we need to @@ -472,22 +550,31 @@ void enableProfiler(const ProfilerConfig& new_config) { state->mark("__start_profile", false); } -thread_event_lists disableProfiler() { +thread_event_lists disableProfiler(c10::optional profilerDisableOptions) { + auto cleanupTLSState = profilerDisableOptions ? profilerDisableOptions->cleanupTLSState : true; + auto consolidate = profilerDisableOptions ? 
profilerDisableOptions->consolidate : true; // all the DebugInfoBase objects are scope based and supposed to use DebugInfoGuard - auto state = c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE); + std::shared_ptr state; + if (cleanupTLSState) { + state = c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE); + } else { + state = c10::ThreadLocalDebugInfo::_peek(c10::DebugInfoKind::PROFILER_STATE); + } + auto state_ptr = static_cast(state.get()); TORCH_CHECK(state_ptr && state_ptr->config().state != ProfilerState::Disabled, "Can't disable profiler when it's not running"); - g_.pop_back(); - at::removeCallback(state_ptr->callbackHandle()); + if (cleanupTLSState) { + at::removeCallback(state_ptr->callbackHandle()); + } - if (state_ptr->config().state == ProfilerState::NVTX) { + if (!consolidate || state_ptr->config().state == ProfilerState::NVTX) { return thread_event_lists(); } state_ptr->mark("__stop_profile"); - + // Note that this will erase the underlying events. return state_ptr->consolidate(); } @@ -516,6 +603,30 @@ void Event::record(bool record_cuda) { NUM_EVENT_IVALUE_IDX, " elements to reconstruct Event."); + // Reconstruct input shapes from ivalues. + auto shapeListIValue = ivalues.get(EventIValueIdx::SHAPES); + TORCH_INTERNAL_ASSERT( + shapeListIValue.isList(), + "Expected profiler shapes IValue to contain type c10::impl::GenericList." + ); + + auto shapeList = shapeListIValue.toList(); + std::vector> shapes; + shapes.reserve(shapeList.size()); + for (size_t i = 0 ; i < shapeList.size(); ++i) { + std::vector s; + auto shapeIValue = shapeList.get(i); + TORCH_INTERNAL_ASSERT( + shapeIValue.isList(), + "Expected each profiler shape element to contain shapes of type c10::impl::GenericList.") + auto curShapesList = shapeIValue.toList(); + s.reserve(curShapesList.size()); + for (size_t j = 0; j < curShapesList.size(); ++j) { + s.emplace_back(curShapesList.get(j).toInt()); + } + shapes.emplace_back(s); + } + Event evt( static_cast( ivalues.get(EventIValueIdx::KIND).toInt()), // EventKind @@ -523,7 +634,7 @@ void Event::record(bool record_cuda) { ivalues.get(EventIValueIdx::THREAD_ID).toInt(), // thread_id static_cast( ivalues.get(EventIValueIdx::HANDLE).toDouble()), // handle - {}, // TODO: record shapes + std::move(shapes), // input shapes ivalues.get(EventIValueIdx::NODE_ID).toInt(), // node id true, // is remote ivalues.get(EventIValueIdx::CPU_MEM_USAGE).toInt(), // cpu_mem_usage @@ -541,22 +652,35 @@ at::IValue Event::toIValue() const { eventIValueList.reserve(NUM_EVENT_IVALUE_IDX); eventIValueList.emplace_back(static_cast(kind_)); eventIValueList.emplace_back(std::string(name_.str())); - eventIValueList.emplace_back(thread_id_); + eventIValueList.emplace_back(static_cast(thread_id_)); eventIValueList.emplace_back(static_cast(handle_)); eventIValueList.emplace_back(node_id_); eventIValueList.emplace_back(cpu_memory_usage_); eventIValueList.emplace_back(cpu_ns_); // CUDA event information - bool cuda_profiling_enabled = has_cuda(); + bool cuda_profiling_enabled = hasCuda(); eventIValueList.emplace_back(cuda_profiling_enabled); eventIValueList.emplace_back(static_cast(cuda_memory_usage_)); eventIValueList.emplace_back(device_); eventIValueList.emplace_back(cuda_us_); + // Shapes + c10::impl::GenericList shapesList = + c10::impl::GenericList(at::ListType::create(at::IntType::get())); + shapesList.reserve(shapes_.size()); + for (const auto& shape : shapes_) { + c10::impl::GenericList s = c10::impl::GenericList(at::IntType::get()); + 
s.reserve(shape.size()); + for (const auto& k : shape) { + s.emplace_back(k); + } + shapesList.emplace_back(s); + } + eventIValueList.emplace_back(shapesList); return at::IValue(eventIValueList); } -double Event::cuda_elapsed_us(const Event& e) const { - TORCH_CHECK(e.has_cuda() && has_cuda(), "Events were not recorded for CUDA"); +double Event::cudaElapsedUs(const Event& e) const { + TORCH_CHECK(e.hasCuda() && hasCuda(), "Events were not recorded for CUDA"); TORCH_CHECK( e.device() == device(), c10::str( @@ -605,22 +729,22 @@ void writeProfilerEventsToStream(std::ostream& out, const std::vector& e bool first = true; for (Event* evt : events) { if (evt->kind() == "push") { - events_map[std::make_pair(evt->handle(), evt->node_id())] = evt; + events_map[std::make_pair(evt->handle(), evt->nodeId())] = evt; } else if (evt->kind() == "pop") { if (!first) { out << ",\n"; } first = false; - auto it = events_map.find(std::make_pair(evt->handle(), evt->node_id())); + auto it = events_map.find(std::make_pair(evt->handle(), evt->nodeId())); TORCH_CHECK(it != events_map.end(), "Unmatched pop event"); Event* evt_start = it->second; events_map.erase(it); jit::TemplateEnv env; env.s("name", evt_start->name()); - env.d("ts", profiler_start->cpu_elapsed_us(*evt_start)); - env.d("dur", evt_start->cpu_elapsed_us(*evt)); - env.d("tid", evt_start->thread_id()); + env.d("ts", profiler_start->cpuElapsedUs(*evt_start)); + env.d("dur", evt_start->cpuElapsedUs(*evt)); + env.d("tid", evt_start->threadId()); out << event_template.format(env); } } @@ -639,10 +763,7 @@ RecordProfile::RecordProfile(const std::string& filename) } void RecordProfile::init() { - enableProfiler(ProfilerConfig( - ProfilerState::CPU, - /* report_input_shapes */ false, - /* profile_memory */ false)); + enableProfiler(ProfilerConfig(ProfilerState::CPU)); } RecordProfile::~RecordProfile() { diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index 3f962eff341d..9cfe9ea1fd6e 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -88,6 +88,21 @@ inline int64_t getTime() { #endif } +// A struct to control settings of disableProfiler options. +struct TORCH_API ProfilerDisableOptions { + ProfilerDisableOptions() = default; + ProfilerDisableOptions(bool shouldCleanupTLSState, bool shouldConsolidate) + : cleanupTLSState(shouldCleanupTLSState), + consolidate(shouldConsolidate) {} + // Whether we should clean up profiler states that are thread local, such as + // ThreadLocalDebugInfo and thread local RecordFunction callbacks. + bool cleanupTLSState = true; + // Whether we should consolidate all currently recorded profiled events. If + // false, will not consolidate and other threads can continue to write to the + // event lists. + bool consolidate = true; +}; + enum class C10_API_ENUM ProfilerState { Disabled, CPU, // CPU-only profiling @@ -98,15 +113,18 @@ enum class C10_API_ENUM ProfilerState { struct TORCH_API ProfilerConfig { ProfilerConfig( ProfilerState state, - bool report_input_shapes, - bool profile_memory) + bool report_input_shapes = false, + bool profile_memory = false, + bool with_stack = false) : state(state), report_input_shapes(report_input_shapes), - profile_memory(profile_memory) {} + profile_memory(profile_memory), + with_stack(with_stack) {} ~ProfilerConfig(); ProfilerState state; bool report_input_shapes; bool profile_memory; + bool with_stack; // Returns IValues corresponding to ProfilerConfig struct, to be used for // serialization. 
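The new `ProfilerDisableOptions` struct and the defaulted `ProfilerConfig` arguments above allow the pattern sketched below. The type and function names come from this diff; the header path and the scenario of finishing consolidation on another thread are assumptions:

```
#include <torch/csrc/autograd/profiler.h>

using namespace torch::autograd::profiler;

void profile_region() {
  // report_input_shapes / profile_memory / with_stack now default to false,
  // so a state-only config covers the common case.
  enableProfiler(ProfilerConfig(ProfilerState::CPU,
                                /*report_input_shapes=*/true,
                                /*profile_memory=*/false,
                                /*with_stack=*/true));

  // ... run the code to be profiled ...

  // Keep the thread-local state and callbacks alive and skip consolidation,
  // e.g. when another thread is expected to finish the profile later.
  disableProfiler(ProfilerDisableOptions(/*shouldCleanupTLSState=*/false,
                                         /*shouldConsolidate=*/false));
}
```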
@@ -203,24 +221,29 @@ struct TORCH_API Event final { const char* name() const { return name_.str(); } - uint16_t thread_id() const { + + uint64_t threadId() const { return thread_id_; } + std::vector> shapes() const { return shapes_; } - double cpu_elapsed_us(const Event & e) const { + + double cpuElapsedUs(const Event& e) const { return (e.cpu_ns_ - cpu_ns_)/(1000.0); } - double cpu_us() const { + double cpuUs() const { return cpu_ns_ / (1000.0); } - double cuda_elapsed_us(const Event & e) const; - bool has_cuda() const { + double cudaElapsedUs(const Event& e) const; + + bool hasCuda() const { return cuda_event != nullptr || (isRemote() && device_ != -1); } + int device() const { return device_; } @@ -238,11 +261,11 @@ struct TORCH_API Event final { } } - int64_t cpu_memory_usage() const { + int64_t cpuMemoryUsage() const { return cpu_memory_usage_; } - int64_t cuda_memory_usage() const { + int64_t cudaMemoryUsage() const { return cuda_memory_usage_; } @@ -251,7 +274,7 @@ struct TORCH_API Event final { } // Node ID corresponding to this event. - int node_id( ) const { + int nodeId( ) const { return node_id_; } @@ -276,16 +299,41 @@ struct TORCH_API Event final { sequence_nr_ = sequence_nr; } - int64_t sequence_nr() const { + int64_t sequenceNr() const { return sequence_nr_; } + const std::vector& stack() const { + return stack_; + } + + void setStack(const std::vector& stack) { + stack_ = stack; + } + + uint64_t fwdThreadId() const { + return fwd_thread_id_; + } + + void setFwdThreadId(uint64_t fwd_thread_id) { + fwd_thread_id_ = fwd_thread_id; + } + + uint8_t scope() const { + return scope_; + } + + void setScope(uint8_t scope) { + scope_ = scope; + } + private: // signed to allow for negative intervals, initialized for safety. int64_t cpu_ns_ = 0; at::StringView name_; EventKind kind_; - uint16_t thread_id_; + uint64_t thread_id_; + uint64_t fwd_thread_id_; at::RecordFunctionHandle handle_ {0}; std::vector> shapes_; int64_t cpu_memory_usage_ = 0; @@ -296,6 +344,9 @@ struct TORCH_API Event final { bool is_remote_ = false; int64_t cuda_us_ = -1; int64_t sequence_nr_ = -1; + + std::vector stack_; + uint8_t scope_; }; // a linked-list of fixed sized vectors, to avoid @@ -341,7 +392,7 @@ using thread_event_lists = std::vector>; // NOTE: profiler mode is thread local, with automatic propagation // across thread boundary (e.g. at::launch tasks) TORCH_API void enableProfiler(const ProfilerConfig&); -TORCH_API thread_event_lists disableProfiler(); +TORCH_API thread_event_lists disableProfiler(c10::optional profilerDisableOptions = c10::nullopt); // adds profiledEvents to the current thread local recorded events. Each event // will be marked with node ID given by fromNodeId. 
TORCH_API void addEventList(std::vector&& profiledEvents); @@ -383,19 +434,27 @@ struct TORCH_API TLSProfilerGuard { explicit TLSProfilerGuard( const ProfilerConfig& cfg, c10::optional> - resultCallback = c10::nullopt) - : cb_(std::move(resultCallback)) { + resultCallback = c10::nullopt, + c10::optional profilerDisableOptions = + c10::nullopt) + : cb_(std::move(resultCallback)), + profilerDisableOptions_(std::move(profilerDisableOptions)) { enableProfiler(cfg); } ~TLSProfilerGuard() { - thread_event_lists event_lists = disableProfiler(); + thread_event_lists event_lists = disableProfiler(profilerDisableOptions_); if (cb_) { - (*cb_)(event_lists); + try { + (*cb_)(event_lists); + } catch (const std::exception& e) { + LOG(ERROR) << "Got error processing profiler events: " << e.what(); + } } } private: c10::optional> cb_; + const c10::optional profilerDisableOptions_; }; } // namespace profiler diff --git a/torch/csrc/autograd/python_engine.cpp b/torch/csrc/autograd/python_engine.cpp index f4c88225efc8..586e956a8549 100644 --- a/torch/csrc/autograd/python_engine.cpp +++ b/torch/csrc/autograd/python_engine.cpp @@ -167,10 +167,6 @@ PyObject *THPEngine_run_backward(THPEngine *self, PyObject *args, PyObject *kwar "vmapped tensors (output ", i, " is being vmapped over). Please " "call autograd.grad() outside torch.vmap or file a bug report " "with your use case.") - if(variable.is_complex()) { - TORCH_WARN_ONCE("Complex backward is not fully supported yet and could lead to wrong ", - "gradients for functions we have not fixed yet"); - } auto gradient_edge = torch::autograd::impl::gradient_edge(variable); THPUtils_assert(gradient_edge.function, "element %d of tensors does not require grad and does not have a grad_fn", i); diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp index 81e10a9a1d1b..28f9c3880d88 100644 --- a/torch/csrc/autograd/python_variable.cpp +++ b/torch/csrc/autograd/python_variable.cpp @@ -388,19 +388,19 @@ PyObject *THPVariable_get_ndim(THPVariable *self, void *unused) END_HANDLE_TH_ERRORS } -PyObject *THPVariable_get_names(THPVariable *self, void *unused) +PyObject *THPVariable_get_names(PyObject *self, void *unused) { HANDLE_TH_ERRORS - if (check_has_torch_function((PyObject *)self)) { - return handle_torch_function_getter(self, "names"); + if (check_has_torch_function(self)) { + return handle_torch_function_getter((THPVariable*)self, "names"); } // The long-term plan is to return a list of (python) torch.Dimname. // However, for now, return a list of string. 
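The `TLSProfilerGuard` changes above (optional result callback, optional disable options, and callback errors that are now logged instead of escaping the destructor) support an RAII usage along these lines. This is a sketch; the callback parameter is assumed to be a `std::function` over `const thread_event_lists&`:

```
#include <torch/csrc/autograd/profiler.h>

void guarded_profile() {
  using namespace torch::autograd::profiler;
  TLSProfilerGuard guard(
      ProfilerConfig(ProfilerState::CPU),
      [](const thread_event_lists& lists) {
        // Consume the consolidated events; if this throws, the destructor
        // logs the error rather than letting it propagate during unwinding.
      });
  // ... profiled work; disableProfiler() runs in the guard's destructor ...
}
```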
- size_t size = self->cdata.dim(); + size_t size = ((THPVariable *)self)->cdata.dim(); THPObjectPtr tuple(PyTuple_New(size)); if (!tuple) throw python_error(); - const auto dimnames = self->cdata.names(); + const auto dimnames = ((THPVariable *)self)->cdata.names(); for (size_t i = 0; i < size; ++i) { PyObject* str; if (dimnames[i].type() == at::NameType::WILDCARD) { @@ -423,12 +423,12 @@ PyObject *THPVariable_get_names(THPVariable *self, void *unused) END_HANDLE_TH_ERRORS } -int THPVariable_set_names(THPVariable *self, PyObject *names) { +int THPVariable_set_names(PyObject *self, PyObject *names) { HANDLE_TH_ERRORS - if (check_has_torch_function((PyObject *)self)) { - return handle_torch_function_setter(self, "names", names); + if (check_has_torch_function(self)) { + return handle_torch_function_setter((THPVariable*)self, "names", names); } - auto& var = self->cdata; + auto& var = ((THPVariable *)self)->cdata; if (names == Py_None) { at::internal_set_names_inplace(var, at::nullopt); } else { diff --git a/torch/csrc/cuda/python_nccl.cpp b/torch/csrc/cuda/python_nccl.cpp index 403bcb2b85da..35dbeae3f3aa 100644 --- a/torch/csrc/cuda/python_nccl.cpp +++ b/torch/csrc/cuda/python_nccl.cpp @@ -199,7 +199,9 @@ PyObject* THCPModule_nccl_broadcast(PyObject* self, PyObject* args) { nullptr, "nccl_broadcast", 1, - "(sequence[Tensor] inputs, int root)"); + "(sequence[Tensor] inputs, int root" + " sequence[torch.cuda.Stream] streams," + " sequence[torch.cuda.nccl.Communicator] comms)"); return nullptr; } @@ -228,7 +230,9 @@ PyObject* THCPModule_nccl_all_gather(PyObject* self, PyObject* args) { nullptr, "nccl_all_gather", 1, - "(sequence[Tensor] inputs, sequence[Tensor] outputs"); + "(sequence[Tensor] inputs, sequence[Tensor] outputs" + " sequence[torch.cuda.Stream] streams," + " sequence[torch.cuda.nccl.Communicator] comms)"); return nullptr; } @@ -258,7 +262,9 @@ PyObject* THCPModule_nccl_reduce_scatter(PyObject* self, PyObject* args) { nullptr, "nccl_reduce_scatter", 1, - "(sequence[Tensor] inputs, sequence[Tensor] outputs, int op"); + "(sequence[Tensor] inputs, sequence[Tensor] outputs, int op" + " sequence[torch.cuda.Stream] streams," + " sequence[torch.cuda.nccl.Communicator] comms)"); return nullptr; } diff --git a/torch/csrc/cuda/shared/cudart.cpp b/torch/csrc/cuda/shared/cudart.cpp index efada16a49c8..a8f80a35855d 100644 --- a/torch/csrc/cuda/shared/cudart.cpp +++ b/torch/csrc/cuda/shared/cudart.cpp @@ -29,7 +29,12 @@ void initCudartBindings(PyObject* module) { cudart.def("cuda" "GetErrorString", cudaGetErrorString); cudart.def("cuda" "ProfilerStart", cudaProfilerStart); cudart.def("cuda" "ProfilerStop", cudaProfilerStop); - cudart.def("cuda" "HostRegister", cudaHostRegister); + cudart.def("cuda" "HostRegister", [](uintptr_t ptr, size_t size, unsigned int flags) -> cudaError_t { + return cudaHostRegister((void*)ptr, size, flags); + }); + cudart.def("cuda" "HostUnregister", [](uintptr_t ptr) -> cudaError_t { + return cudaHostUnregister((void*)ptr); + }); #ifndef __HIP_PLATFORM_HCC__ cudart.def("cuda" "ProfilerInitialize", cudaProfilerInitialize); #endif diff --git a/torch/csrc/distributed/autograd/init.cpp b/torch/csrc/distributed/autograd/init.cpp index 9ab16fb6a93c..09de7abb87a5 100644 --- a/torch/csrc/distributed/autograd/init.cpp +++ b/torch/csrc/distributed/autograd/init.cpp @@ -15,7 +15,7 @@ namespace { template using shared_ptr_class_ = py::class_>; -PyObject* dist_autograd_init(PyObject* /* unused */) { +PyObject* dist_autograd_init(PyObject* _unused, PyObject* noargs) { auto 
autograd_module = THPObjectPtr(PyImport_ImportModule("torch.distributed.autograd")); if (!autograd_module) { @@ -196,7 +196,7 @@ Example:: static PyMethodDef methods[] = { // NOLINT {"_dist_autograd_init", - (PyCFunction)dist_autograd_init, + dist_autograd_init, METH_NOARGS, nullptr}, {nullptr, nullptr, 0, nullptr}}; diff --git a/torch/csrc/distributed/autograd/utils.cpp b/torch/csrc/distributed/autograd/utils.cpp index 726cc605a913..464d8248d8a4 100644 --- a/torch/csrc/distributed/autograd/utils.cpp +++ b/torch/csrc/distributed/autograd/utils.cpp @@ -143,7 +143,8 @@ std::shared_ptr sendMessageWithAutograd( const WorkerInfo& dst, torch::distributed::rpc::Message&& wrappedRpcMsg, bool forceGradRecording, - const float rpcTimeoutSeconds) { + const float rpcTimeoutSeconds, + bool forceDisableProfiling) { auto msg = getMessageWithAutograd( dst.id_, std::move(wrappedRpcMsg), @@ -153,7 +154,7 @@ std::shared_ptr sendMessageWithAutograd( std::shared_ptr fut; // If profiler is enabled, wrap this message with profiling metadata that will // tell the remote end to process this request with the profiler enabled. - if (torch::autograd::profiler::profilerEnabled()) { + if (!forceDisableProfiling && torch::autograd::profiler::profilerEnabled()) { auto profilerConfig = torch::autograd::profiler::getProfilerConfig(); auto msgWithProfiling = getMessageWithProfiling( std::move(msg), diff --git a/torch/csrc/distributed/autograd/utils.h b/torch/csrc/distributed/autograd/utils.h index c6316378a146..2a0a066e1a95 100644 --- a/torch/csrc/distributed/autograd/utils.h +++ b/torch/csrc/distributed/autograd/utils.h @@ -51,7 +51,8 @@ sendMessageWithAutograd( const rpc::WorkerInfo& dst, rpc::Message&& wrappedRpcMsg, bool forceGradRecording = false, - const float rpcTimeoutSeconds = torch::distributed::rpc::kUnsetRpcTimeout); + const float rpcTimeoutSeconds = torch::distributed::rpc::kUnsetRpcTimeout, + bool forceDisableProfiling = false); } // namespace autograd } // namespace distributed diff --git a/torch/csrc/distributed/c10d/c10d_frontend.h b/torch/csrc/distributed/c10d/c10d_frontend.h new file mode 100644 index 000000000000..9ff4b69999c7 --- /dev/null +++ b/torch/csrc/distributed/c10d/c10d_frontend.h @@ -0,0 +1,86 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace c10d { + +class Backend { + public: + // Maps to Backend.__new__ in Python. + static std::string get(std::string); + + // TODO: How to support registering third_party backend? + static void registerBackend(); + + private: + // TODO: Should this be an enum list instead since this set doesn't + // change at all. 
+ std::unordered_set registered_backends_; +}; + +class DistributedC10d{ + public: + void initProcessGroup( + const std::string& backend, + const std::string& init_method, + const std::chrono::milliseconds& timeout, + int64_t world_size, + int64_t rank, + std::shared_ptr store, + const std::string& group_name); + + void destroyProcessGroup(std::shared_ptr group); + int64_t getRank(std::shared_ptr group); + int64_t getWorldSize(std::shared_ptr group); + + ProcessGroup::Work isend(at::Tensor tensor, int64_t dst, std::shared_ptr group, c10::optional tag); + ProcessGroup::Work irecv(at::Tensor tensor, int64_t src, std::shared_ptr group, c10::optional tag); + + private: + DistributedC10d(){}; + + bool rankNotInGroup(std::shared_ptr group) const; + int64_t getGroupRank( + std::shared_ptr group, + const int64_t rank) const; + int64_t getGlobalRank( + std::shared_ptr group, + const int64_t global_rank) const; + void checkDefaultPg() const; + int64_t getGroupSize(std::shared_ptr group) const; + int64_t getBackend(std::shared_ptr group); + + std::string backend_; + // TODO: Ask Alex what kind of equality we need. It determine whether we + // need to use ProcessGroup or ProcesGroup* as key. + std::unordered_map< + std::shared_ptr, + std::pair, std::shared_ptr>> + pg_map_; + + // Note, this is different mapping relationship than original Python + // implementation. + std::unordered_map, std::string> pg_names_; + + // Value is global_rank:group_rank mapping. + std::unordered_map, std::vector> + pg_group_ranks_; + + std::shared_ptr default_pg_; + + // Default value should be "env://" + std::string default_pg_init_method_; + + int64_t group_count_; +}; + + +} // namespace c10d diff --git a/torch/csrc/distributed/c10d/comm.h b/torch/csrc/distributed/c10d/comm.h index e2b501f08aff..2eb626c40232 100644 --- a/torch/csrc/distributed/c10d/comm.h +++ b/torch/csrc/distributed/c10d/comm.h @@ -38,7 +38,7 @@ class GradBucket { // DDP's c10d reducer allows communication hooks defined as a sub class // of CommHookInterface. CommHookInterface is an abstract class and can // be used to implement both Python and CPP hooks. -struct TORCH_API CommHookInterface { +struct TORCH_PYTHON_API CommHookInterface { public: virtual ~CommHookInterface() {} @@ -59,7 +59,7 @@ struct TORCH_API CommHookInterface { // PythonCommHook enables registering a python hook to c10d reducer and is a // sub class of CommHookInterface. -class TORCH_API PythonCommHook : public CommHookInterface { +class TORCH_PYTHON_API PythonCommHook : public CommHookInterface { public: // The constructor takes a state and a callable hook. Inputs are Python // objects. 
The state is passed to the hook in runHook function can be used to diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index aff2da31c133..d15ea9d23412 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -1,7 +1,11 @@ #include #include +#ifndef _WIN32 #include +#include +#include +#endif #include #ifdef USE_C10D_GLOO @@ -17,8 +21,6 @@ #endif #include -#include -#include #include #include @@ -92,6 +94,14 @@ class PythonStore : public ::c10d::Store { PYBIND11_OVERLOAD_PURE(int64_t, ::c10d::Store, add, key, value); } + int64_t getNumKeys() override { + PYBIND11_OVERLOAD_PURE(int64_t, ::c10d::Store, getNumKeys); + } + + bool deleteKey(const std::string& key) override { + PYBIND11_OVERLOAD_PURE(bool, ::c10d::Store, deleteKey, key); + } + bool check(const std::vector& keys) override { PYBIND11_OVERLOAD_PURE(bool, ::c10d::Store, check, keys); } @@ -121,7 +131,7 @@ void _register_comm_hook( std::move(state), std::move(comm_hook))); }; -PyObject* c10d_init(PyObject* _unused) { +PyObject* c10d_init(PyObject* _unused, PyObject* noargs) { C10_LOG_API_USAGE_ONCE("c10d.python.import"); auto c10d_module = THPObjectPtr(PyImport_ImportModule("torch.distributed")); if (!c10d_module) { @@ -159,6 +169,7 @@ PyObject* c10d_init(PyObject* _unused) { std::shared_ptr<::c10d::ProcessGroup>, std::vector>, int64_t, + bool, bool>(), py::arg("replicas"), py::arg("bucket_indices"), @@ -166,6 +177,7 @@ PyObject* c10d_init(PyObject* _unused) { py::arg("expect_sparse_gradients") = std::vector>(), py::arg("bucket_bytes_cap") = ::c10d::kDefaultBucketBytesCap, py::arg("find_unused_parameters") = false, + py::arg("gradient_as_bucket_view") = false, py::call_guard()) .def( "initialize_buckets", @@ -272,7 +284,12 @@ They are used in specifying strategies for reduction collectives, e.g., auto store = py::class_<::c10d::Store, std::shared_ptr<::c10d::Store>, PythonStore>( - module, "Store") + module, "Store", + R"( +Base class for all store implementations, such as the 3 provided by PyTorch +distributed: (:class:`~torch.distributed.TCPStore`, :class:`~torch.distributed.FileStore`, +and :class:`~torch.distributed.HashStore`). +)") // Default constructor. .def(py::init<>()) // Convert from std::string to std::vector. @@ -284,7 +301,23 @@ They are used in specifying strategies for reduction collectives, e.g., std::vector value_(value.begin(), value.end()); store.set(key, value_); }, - py::call_guard()) + py::call_guard(), + R"( +Inserts the key-value pair into the store based on the supplied ``key`` and +``value``. If ``key`` already exists in the store, it will overwrite the old +value with the new supplied ``value``. + +Arguments: + key (str): The key to be added to the store. + value (str): The value associated with ``key`` to be added to the store. + +Example:: + >>> import torch.distributed as dist + >>> store = dist.TCPStore("127.0.0.1", 0, true, timedelta(seconds=30)) + >>> store.set("first_key", "first_value") + >>> # Should return "first_value" + >>> store.get("first_key") +)") // Convert from std::vector to py::bytes. // The returned value is not guaranteed to be valid UTF-8. .def( @@ -294,21 +327,141 @@ They are used in specifying strategies for reduction collectives, e.g., return py::bytes( reinterpret_cast(value.data()), value.size()); }, - py::call_guard()) + py::call_guard(), + R"( +Retrieves the value associated with the given ``key`` in the store. 
If ``key`` is not +present in the store, the function will wait for ``timeout``, which is defined +when initializing the store, before throwing an exception. + +Arguments: + key (str): The function will return the value associated with this key. + +Returns: + Value associated with ``key`` if ``key`` is in the store. + +Example:: + >>> import torch.distributed as dist + >>> store = dist.TCPStore("127.0.0.1", 0, true, timedelta(seconds=30)) + >>> store.set("first_key", "first_value") + >>> # Should return "first_value" + >>> store.get("first_key") +)") .def( "add", &::c10d::Store::add, - py::call_guard()) + py::call_guard(), + R"( +The first call to add for a given ``key`` creates a counter associated +with ``key`` in the store, initialized to ``amount``. Subsequent calls to add +with the same ``key`` increment the counter by the specified ``amount``. +Calling :meth:`~torch.distributed.store.add` with a key that has already +been set in the store by :meth:`~torch.distributed.store.set` will result +in an exception. + +Arguments: + key (str): The key in the store whose counter will be incremented. + amount (int): The quantity by which the counter will be incremented. + +Example:: + >>> import torch.distributed as dist + >>> # Using TCPStore as an example, other store types can also be used + >>> store = dist.TCPStore("127.0.0.1", 0, true, timedelta(seconds=30)) + >>> store.add("first_key", 1) + >>> store.add("first_key", 6) + >>> # Should return 7 + >>> store.get("first_key") +)") + .def( + "delete_key", + &::c10d::Store::deleteKey, + py::call_guard(), + R"( +Deletes the key-value pair associated with ``key`` from the store. Returns +`true` if the key was successfully deleted, and `false` if it was not. + +.. warning:: + The ``delete_key`` API is only supported by the :class:`~torch.distributed.TCPStore`. Using this API + with the :class:`~torch.distributed.FileStore` or :class:`~torch.distributed.HashStore` will result in an exception. + +Arguments: + key (str): The key to be deleted from the store + +Returns: + `true` if ``key`` was deleted, otherwise `false`. + +Example:: + >>> import torch.distributed as dist + >>> store = dist.TCPStore("127.0.0.1", 0, true, timedelta(seconds=30)) + >>> store.set("first_key") + >>> # This should return true + >>> store.delete_key("first_key") + >>> # This should return false + >>> store.delete_key("bad_key") +)") + .def( + "num_keys", + &::c10d::Store::getNumKeys, + py::call_guard(), + R"( +Returns the number of keys set in the store. Note that this number will typically +be one greater than the number of keys added by :meth:`~torch.distributed.store.set` +and :meth:`~torch.distributed.store.add` since one key is used to coordinate all +the workers using the store. + +.. warning:: + The ``num_keys`` API is only supported by the :class:`~torch.distributed.TCPStore`. Using this API + with the :class:`~torch.distributed.FileStore` or :class:`~torch.distributed.HashStore` will result in an exception. + +Returns: + The number of keys present in the store. + +Example:: + >>> import torch.distributed as dist + >>> store = dist.TCPStore("127.0.0.1", 0, true, timedelta(seconds=30)) + >>> store.set("first_key", "first_value") + >>> # This should return 2 + >>> store.num_keys() +)") .def( "set_timeout", &::c10d::Store::setTimeout, - py::call_guard()) + py::call_guard(), + R"( +Sets the store's default timeout. This timeout is used during initialization and in +:meth:`~torch.distributed.store.wait` and :meth:`~torch.distributed.store.get`. 
+ +Arguments: + timeout (timedelta): timeout to be set in the store. + +Example:: + >>> import torch.distributed as dist + >>> # Using TCPStore as an example, other store types can also be used + >>> store = dist.TCPStore("127.0.0.1", 0, true, timedelta(seconds=30)) + >>> store.set_timeout(timedelta(seconds=10)) + >>> # This will throw an exception after 10 seconds + >>> store.wait(["bad_key"]) +)") .def( "wait", [](::c10d::Store& store, const std::vector& keys) { store.wait(keys); }, - py::call_guard()) + py::call_guard(), + R"( +Waits for each key in ``keys`` to be added to the store. If not all keys are +set before the ``timeout`` (set during store initialization), then ``wait`` +will throw an exception. + +Arguments: + keys (list): List of keys on which to wait until they are set in the store. + +Example:: + >>> import torch.distributed as dist + >>> # Using TCPStore as an example, other store types can also be used + >>> store = dist.TCPStore("127.0.0.1", 0, true, timedelta(seconds=30)) + >>> # This will throw an exception after 30 seconds + >>> store.wait(["bad_key"]) +)") .def( "wait", [](::c10d::Store& store, @@ -316,15 +469,79 @@ They are used in specifying strategies for reduction collectives, e.g., const std::chrono::milliseconds& timeout) { store.wait(keys, timeout); }, - py::call_guard()); - - shared_ptr_class_<::c10d::FileStore>(module, "FileStore", store) + py::call_guard(), + R"( +Waits for each key in ``keys`` to be added to the store, and throws an exception +if the keys have not been set by the supplied ``timeout``. + +Arguments: + keys (list): List of keys on which to wait until they are set in the store. + timeout (timedelta): Time to wait for the keys to be added before throwing an exception. + +Example:: + >>> import torch.distributed as dist + >>> # Using TCPStore as an example, other store types can also be used + >>> store = dist.TCPStore("127.0.0.1", 0, true, timedelta(seconds=30)) + >>> # This will throw an exception after 10 seconds + >>> store.wait(["bad_key"], timedelta(seconds=10)) +)"); + + shared_ptr_class_<::c10d::FileStore>(module, "FileStore", store, + R"( +A store implementation that uses a file to store the underlying key-value pairs. + +Arguments: + file_name (str): path of the file in which to store the key-value pairs + world_size (int): The total number of processes using the store + +Example:: + >>> import torch.distributed as dist + >>> store1 = dist.FileStore("/tmp/filestore", 2) + >>> store2 = dist.FileStore("/tmp/filestore", 2) + >>> # Use any of the store methods from either the client or server after initialization + >>> store1.set("first_key", "first_value") + >>> store2.get("first_key") + + )") .def(py::init()); - shared_ptr_class_<::c10d::HashStore>(module, "HashStore", store) +#ifndef _WIN32 + shared_ptr_class_<::c10d::HashStore>(module, "HashStore", store, + R"( +A thread-safe store implementation based on an underlying hashmap. This store can be used +within the same process (for example, by other threads), but cannot be used across processes. + +Example:: + >>> import torch.distributed as dist + >>> store = dist.HashStore() + >>> # store can be used from other threads + >>> # Use any of the store methods after initialization + >>> store.set("first_key", "first_value") + )") .def(py::init<>()); - shared_ptr_class_<::c10d::TCPStore>(module, "TCPStore", store) + shared_ptr_class_<::c10d::TCPStore>(module, "TCPStore", store, + R"( +A TCP-based distributed key-value store implementation. 
The server store holds +the data, while the client stores can connect to the server store over TCP and +perform actions such as :meth:`~torch.distributed.store.set` to insert a key-value +pair, :meth:`~torch.distributed.store.get` to retrieve a key-value pair, etc. + +Arguments: + host_name (str): The hostname or IP Address the server store should run on. + port (int): The port on which the server store should listen for incoming requests. + world_size (int): The total number of store users (number of clients + 1 for the server). + is_master (bool): True when initializing the server store, False for client stores. + timeout (timedelta): Timeout used by the store during initialization and for methods such as :meth:`~torch.distributed.store.get` and :meth:`~torch.distributed.store.wait`. + +Example:: + >>> import torch.distributed as dist + >>> server_store = dist.TCPStore("127.0.0.1", 0, true, timedelta(seconds=30)) + >>> client_store = dist.TCPStore("127.0.0.1", 0, false) + >>> # Use any of the store methods from either the client or server after initialization + >>> server_store.set("first_key", "first_value") + >>> client_store.get("first_key") + )") .def( py::init< const std::string&, @@ -338,8 +555,18 @@ They are used in specifying strategies for reduction collectives, e.g., py::arg("is_master"), py::arg("timeout") = std::chrono::milliseconds(::c10d::Store::kDefaultTimeout)); +#endif - shared_ptr_class_<::c10d::PrefixStore>(module, "PrefixStore", store) + shared_ptr_class_<::c10d::PrefixStore>(module, "PrefixStore", store, + R"( +A wrapper around any of the 3 key-value stores (:class:`~torch.distributed.TCPStore`, +:class:`~torch.distributed.FileStore`, and :class:`~torch.distributed.HashStore`) +that adds a prefix to each key inserted to the store. + +Arguments: + prefix (str): The prefix string that is prepended to each key before being inserted into the store. + store (torch.distributed.store): A store object that forms the underlying key-value store. 
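+
+Any of the store implementations above can serve as the underlying store; the
+sketch below wraps a :class:`~torch.distributed.FileStore` purely for
+illustration.
+
+Example::
+    >>> import torch.distributed as dist
+    >>> underlying_store = dist.FileStore("/tmp/filestore", 2)
+    >>> prefix_store = dist.PrefixStore("worker0", underlying_store)
+    >>> # The key is prepended with the "worker0" prefix before being inserted
+    >>> # into the underlying FileStore.
+    >>> prefix_store.set("first_key", "first_value")
+    >>> prefix_store.get("first_key")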
+ )") .def(py::init>()); auto processGroup = @@ -605,6 +832,7 @@ They are used in specifying strategies for reduction collectives, e.g., py::arg("opts") = ::c10d::BarrierOptions(), py::call_guard()); +#ifndef _WIN32 module.def( "_round_robin_process_groups", [](std::vector> processGroups) @@ -618,6 +846,7 @@ They are used in specifying strategies for reduction collectives, e.g., }, py::arg("process_groups"), py::call_guard()); +#endif #ifdef USE_C10D_GLOO auto processGroupGloo = shared_ptr_class_<::c10d::ProcessGroupGloo>( @@ -720,6 +949,12 @@ They are used in specifying strategies for reduction collectives, e.g., .def(py::init<>()) .def_readwrite("is_high_priority", &::c10d::ProcessGroupNCCL::Options::isHighPriorityStream) .def_readwrite("op_timeout", &::c10d::ProcessGroupNCCL::Options::opTimeout); + processGroupNCCL.def_static("_group_start", []() { + ::c10d::ProcessGroupNCCL::groupStart(); + }); + processGroupNCCL.def_static("_group_end", []() { + ::c10d::ProcessGroupNCCL::groupEnd(); + }); #endif #ifdef USE_C10D_MPI diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index ac4e735af94a..90128e48ee1d 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -32,7 +32,8 @@ Reducer::Reducer( std::shared_ptr process_group, std::vector> expect_sparse_gradients, int64_t bucket_bytes_cap, - bool find_unused_parameters) + bool find_unused_parameters, + bool gradient_as_bucket_view) : replicas_(std::move(replicas)), process_group_(std::move(process_group)), expect_sparse_gradients_(std::move(expect_sparse_gradients)), @@ -41,6 +42,7 @@ Reducer::Reducer( next_bucket_(0), has_marked_unused_parameters_(false), find_unused_parameters_(find_unused_parameters), + gradient_as_bucket_view_(gradient_as_bucket_view), local_used_maps_reduced_(false), backward_stats_base_(0), has_rebuilt_bucket_(false), @@ -87,10 +89,7 @@ Reducer::Reducer( for (size_t variable_index = 0; variable_index < variable_count; variable_index++) { auto& variable = replicas_[replica_index][variable_index]; - const auto index = VariableIndex{ - .replica_index = replica_index, - .variable_index = variable_index, - }; + const auto index = VariableIndex(replica_index, variable_index); // The gradient accumulator function is lazily initialized once. // Therefore we can use its presence in the autograd graph as @@ -98,15 +97,19 @@ Reducer::Reducer( auto grad_accumulator = torch::autograd::impl::grad_accumulator(variable); +#ifndef _WIN32 using torch::distributed::autograd::ThreadLocalDistAutogradContext; +#endif // Hook to execute after the gradient accumulator has executed. hooks_.emplace_back( grad_accumulator->add_post_hook( torch::make_unique( [=](const torch::autograd::variable_list& outputs, const torch::autograd::variable_list& /* unused */) { +#ifndef _WIN32 this->rpc_context_.set( ThreadLocalDistAutogradContext::getContextPtr()); +#endif this->autograd_hook(index); return outputs; })), @@ -190,7 +193,7 @@ Reducer::Reducer( // used to override how DDP communicates gradients across ranks, this can be // used for algorithms like Gradient Compression/GossipGrad. This hook can be // registered from Python API using `register_comm_hook`. `PythonCommHook` -// enables registering a Python hook and is a sub class of `CommHookInterface`. +// enables registering a Python hook and is a subclass of `CommHookInterface`. // `CommHookInterface` can be used to implement CPP hooks in the future. 
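+// For illustration, an allreduce-style Python hook registered through this path
+// might look like the following sketch. The Python-side names used here (the
+// private `_register_comm_hook` method on the DDP model, the bucket's
+// `get_tensors()` accessor, and `Work.get_future()`) are assumptions for the
+// purpose of the example, not guaranteed API:
+//
+//   import torch.distributed as dist
+//
+//   def allreduce_hook(state, bucket):
+//       tensor = bucket.get_tensors()[0]   # flattened gradients of this bucket
+//       work = dist.all_reduce(tensor, async_op=True)
+//       return work.get_future()           # future the reducer waits on
+//
+//   ddp_model._register_comm_hook(state=None, hook=allreduce_hook)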
Reducer::~Reducer() noexcept(false) { @@ -310,6 +313,56 @@ void Reducer::verify_replica0_across_processes() { } } +void Reducer::check_grad_layout( + const at::Tensor& grad, + const at::Tensor& bucket_view) { + // Ensure that the gradient type matches the bucket type. + TORCH_CHECK( + grad.options().type_equal(bucket_view.options()), + "Expected ", + bucket_view.toString(), + ", got ", + grad.toString()); + TORCH_INTERNAL_ASSERT(grad.device() == bucket_view.device()); + TORCH_INTERNAL_ASSERT(grad.numel() == bucket_view.numel()); + // AccumulateGrad doesn't HAVE to obey the grad layout contract. + // The penalty for disobedience is reduced performance, not numerical + // death. Warnings here help diagnose poor DDP performance. + if (grad.strides() != bucket_view.strides()) { + TORCH_WARN_ONCE( + "Grad strides do not match bucket view strides. " + "This may indicate grad was not created according to the " + "gradient layout contract, or that the param's strides " + "changed since DDP was constructed. This is not an error, " + "but may impair performance.\n" + "grad.sizes() = ", + grad.sizes(), + ", strides() = ", + grad.strides(), + "\n", + "bucket_view.sizes() = ", + bucket_view.sizes(), + ", strides() = ", + bucket_view.strides()); + } + if (!gradient_as_bucket_view_) { + TORCH_INTERNAL_ASSERT(!grad.is_alias_of(bucket_view)); + } +} + +void Reducer::copy_grad_to_bucket(at::Tensor& grad, at::Tensor& bucket_view) { + // See Note [DDP Communication Hook] + if (comm_hook_ == nullptr) { + // imitates wrapped_scalar_tensor in ATen/native/BinaryOps.cpp + auto wrapped = c10::scalar_to_tensor(double(1.) / divFactor_); + wrapped.unsafeGetTensorImpl()->set_wrapped_number(true); + // Divides while copying into the bucket view. + at::native::mul_out(bucket_view, grad, wrapped); + } else { + bucket_view.copy_(grad); + } +} + void Reducer::mark_variable_ready_dense(VariableIndex index) { const auto replica_index = index.replica_index; const auto variable_index = index.variable_index; @@ -327,49 +380,27 @@ void Reducer::mark_variable_ready_dense(VariableIndex index) { // of the bucket it would otherwise hold. runGradCallbackForVariable(variable, [&](auto& grad) { if (grad.defined()) { - // Ensure that the gradient type matches the bucket type. - TORCH_CHECK( - grad.options().type_equal(bucket_view.options()), - "Expected ", - bucket_view.toString(), - ", got ", - grad.toString()); - // Assert that the grad tensor and the bucket don't share storage. - // If they did, we could avoid the copy altogether. - // The reason for not doing this is that existing code calls - // `detach_` from `zero_grad`, which is incompatible with views. - TORCH_INTERNAL_ASSERT(!grad.is_alias_of(bucket_view)); - TORCH_INTERNAL_ASSERT(grad.device() == bucket_view.device()); - TORCH_INTERNAL_ASSERT(grad.numel() == bucket_view.numel()); - // AccumulateGrad doesn't HAVE to obey the grad layout contract. - // The penalty for disobedience is reduced performance, not numerical - // death. Warnings here help diagnose poor DDP performance. - if (grad.strides() != bucket_view.strides()) { - TORCH_WARN_ONCE( - "Grad strides do not match bucket view strides. " - "This may indicate grad was not created according to the " - "gradient layout contract, or that the param's strides " - "changed since DDP was constructed. 
This is not an error, " - "but may impair performance.\n" - "grad.sizes() = ", - grad.sizes(), - ", strides() = ", - grad.strides(), - "\n", - "bucket_view.sizes() = ", - bucket_view.sizes(), - ", strides() = ", - bucket_view.strides()); - } - // See Note [DDP Communication Hook] - if (comm_hook_ == nullptr) { - // imitates wrapped_scalar_tensor in ATen/native/BinaryOps.cpp - auto wrapped = c10::scalar_to_tensor(double(1.) / divFactor_); - wrapped.unsafeGetTensorImpl()->set_wrapped_number(true); - // Divides while copying into the bucket view. - at::native::mul_out(bucket_view, grad, wrapped); + this->check_grad_layout(grad, bucket_view); + // When gradient_as_bucket_view_ is false, or even when + // gradient_as_bucket_view_ is true, in rare cases users may set grad to + // be None after every iteration. In these cases, grad and bucket_view are + // pointing to different storages and thus need to copy grads to + // bucket_view. If gradient_as_bucket_view_ is set as true, let grad point + // to bucket_view. If grad has already been set as views of buckets in + // previous iterations, no copy is needed. + if (!grad.is_alias_of(bucket_view)) { + this->copy_grad_to_bucket(grad, bucket_view); + if (gradient_as_bucket_view_) { + // Let grad point to bucket_view buffer. + grad = bucket_view; + // The grad is modified and need to be written back. + return true; + } } else { - bucket_view.copy_(grad); + // If grad and bucket view point to the same storage, no need to copy + if (comm_hook_ == nullptr) { + bucket_view.div_(divFactor_); + } } } else { bucket_view.zero_(); @@ -425,11 +456,9 @@ std::vector> Reducer::get_bucket_tensors() const { void Reducer::set_forward_pass_work_handle( std::shared_ptr forwardPassWorkHandle, - at::Tensor& tensor, bool useStaticWorldSize) { std::lock_guard lock(mutex_); forwardPassWorkHandle_.workHandle = std::move(forwardPassWorkHandle); - forwardPassWorkHandle_.resultTensor = tensor; forwardPassWorkHandle_.useStaticWorldSize = useStaticWorldSize; } @@ -449,10 +478,7 @@ void Reducer::push_rebuilt_params_for_all_indices() { const auto variable_count = replicas_[replica_index].size(); for (size_t variable_index = 0; variable_index < variable_count; ++variable_index) { - const auto index = VariableIndex{ - .replica_index = replica_index, - .variable_index = variable_index, - }; + const auto index = VariableIndex(replica_index, variable_index); push_rebuilt_params(index); } } @@ -495,7 +521,7 @@ void Reducer::autograd_hook(VariableIndex index) { // rebuilt_param_indices_ based on gradient arriving order, and then at the // end of finalize_backward(), buckets will be rebuilt based on // rebuilt_params_ and rebuilt_param_indices_, and then will be broadcasted - // and initialized. Also we only need to dump tensors and parameter indcies of + // and initialized. Also we only need to dump tensors and parameter indices of // one replica. 
push_rebuilt_params(index); @@ -573,12 +599,13 @@ void Reducer::mark_variable_ready(VariableIndex index) { if (divFactor_ == kUnsetDivFactor) { divFactor_ = process_group_->getSize(); auto& workHandle = forwardPassWorkHandle_.workHandle; - if (workHandle) { - if (!forwardPassWorkHandle_.useStaticWorldSize) { - workHandle->wait(); - at::Tensor& res = forwardPassWorkHandle_.resultTensor; - divFactor_ = res.item().to(); - } + if (workHandle && !forwardPassWorkHandle_.useStaticWorldSize) { + workHandle->wait(); + auto results = workHandle->result(); + // Guard against the results being empty + TORCH_INTERNAL_ASSERT(results.size() > 0); + at::Tensor& res = results.front(); + divFactor_ = res.item().to(); } } @@ -675,6 +702,19 @@ void Reducer::mark_bucket_ready(size_t bucket_index) { void Reducer::initialize_buckets( std::vector> bucket_indices) { + // If initialize_buckets is called inside DDP constructor, then + // it does not matter rpc context ptr is nullptr or not, as grad + // will not be mutated. + // If initialize_buckets is called during training loop, e.g, inside + // rebuild_buckets(), since grad could be mutated and be pointed to + // bucket_view, then it needs to check rpc context ptr is nullptr or not, + // If rpc context ptr is nullptr, mutate variable.grad(); otherwise, + // mutate grad in rpc context. +#ifndef _WIN32 + using torch::distributed::autograd::ThreadLocalDistAutogradContext; + this->rpc_context_.set(ThreadLocalDistAutogradContext::getContextPtr()); +#endif + // This shouldn't be called if we're expecting autograd hooks to fire. TORCH_CHECK( !expect_autograd_hooks_, @@ -810,10 +850,8 @@ void Reducer::initialize_buckets( TORCH_CHECK( variable_index < variable_locators_.size(), "Out of range variable index specified."); - variable_locators_[variable_index] = VariableLocator{ - .bucket_index = bucket_index, - .intra_bucket_index = intra_bucket_index++, - }; + variable_locators_[variable_index] = VariableLocator( + bucket_index, intra_bucket_index++); } bucket.variable_indices = std::move(bucket_indices[bucket_index]); @@ -826,7 +864,7 @@ void Reducer::initialize_bucket_views( Reducer::BucketReplica& replica, at::Tensor& contents) { for (size_t i = 0; i < replica.variables.size(); i++) { - const auto& v = replica.variables[i]; + auto& v = replica.variables[i]; const auto offset = replica.offsets[i]; const auto length = replica.lengths[i]; if (v.is_non_overlapping_and_dense()) { @@ -845,6 +883,29 @@ void Reducer::initialize_bucket_views( // By default `bucket_views_out` and `bucket_views_in` are // essentially the same thing. replica.bucket_views_out = replica.bucket_views_in; + + // If gradient_as_bucket_view_ is set as true, then there are two cases to + // handle: initialize_bucket_views could be called inside initialize_buckets + // when rebuild_buckets, if grad has already been defined/calculated in + // previous iteration, old grad needs to be copied into new bucket_view and + // let grad point to the new bucket_view, initialize_bucket_views could also + // be called inside initialize_buckets during construction. Grads are not + // defined during construction time, in this case, do not let grad point to + // bucket_view, because grads should be kept as being undefined for globally + // unused parameters. 
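+  // On the Python side this behavior is expected to be toggled through the
+  // constructor flag that maps to gradient_as_bucket_view_ here, e.g.
+  // (illustrative sketch; the exact DistributedDataParallel keyword is an
+  // assumption):
+  //
+  //   model = torch.nn.parallel.DistributedDataParallel(
+  //       module, device_ids=[rank], gradient_as_bucket_view=True)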
+ if (gradient_as_bucket_view_) { + auto& bucket_view = replica.bucket_views_in.back(); + runGradCallbackForVariable(v, [&](auto& grad) { + if (grad.defined() && !grad.is_alias_of(bucket_view)) { + bucket_view.copy_(grad); + grad = bucket_view; + // The grad is modefied and needs to be written back. + return true; + } + // The grad is not modified and does not need to be written back. + return false; + }); + } } } @@ -966,6 +1027,31 @@ void Reducer::prepare_for_backward( } } +void Reducer::copy_bucket_to_grad( + torch::autograd::Variable& variable, + Reducer::BucketReplica& replica, + size_t intra_bucket_index, + bool global_unused) { + const auto& bucket_view = replica.bucket_views_out[intra_bucket_index]; + runGradCallbackForVariable(variable, [&](auto& grad) { + // If a parameter is globally unused, we keep its grad untouched. + if (!global_unused) { + if (!grad.defined()) { + // Creates grad according to the "Gradient Layout Contract" + // (see torch/csrc/grad/AccumulateGrad.h) + grad = + torch::autograd::utils::clone_obey_contract(bucket_view, variable); + } else { + grad.copy_(bucket_view); + } + // The grad is modified and needs to be written back. + return true; + } + // The grad is not modified. + return false; + }); +} + // A bucket with one or more dense tensors needs to be unflattened. void Reducer::finalize_bucket_dense(Bucket& bucket) { for (size_t replica_index = 0; replica_index < bucket.replicas.size(); @@ -1016,24 +1102,52 @@ void Reducer::finalize_bucket_dense(Bucket& bucket) { } } - const auto& bucket_view = replica.bucket_views_out[intra_bucket_index]; - runGradCallbackForVariable(variable, [&](auto& grad) { - // If a parameter is globally unused, we keep its grad untouched. - if (!global_unused) { - if (!grad.defined()) { - // Creates grad according to the "Gradient Layout Contract" - // (see torch/csrc/grad/AccumulateGrad.h) - grad = torch::autograd::utils::clone_obey_contract( - bucket_view, variable); - } else { - grad.copy_(bucket_view); - } - // The grad is modified and needs to be written back. - return true; + if (!gradient_as_bucket_view_) { + copy_bucket_to_grad( + variable, replica, intra_bucket_index, global_unused); + } else { + const auto& bucket_view_out = + replica.bucket_views_out[intra_bucket_index]; + auto& bucket_view_in = replica.bucket_views_in[intra_bucket_index]; + // If communication_hook is registered, bucket_view_out stores + // allreduced results in a newly allocated tensor, copy bucket_view_out + // back to bucket_view_in that referring to replica.content tensor and + // grad. + if (!bucket_view_in.is_alias_of(bucket_view_out)) { + bucket_view_in.copy_(bucket_view_out); } - // The grad is not modified. - return false; - }); + runGradCallbackForVariable(variable, [&](auto& grad) { + // If a parameter is globally unused, we keep its grad untouched. + if (!global_unused) { + // If grad is globally used but locally unused, let grad point to + // bucket_view_in + if (!grad.defined()) { + grad = bucket_view_in; + } else { + if (!grad.is_alias_of(bucket_view_in)) { + grad.copy_(bucket_view_in); + TORCH_WARN_ONCE( + "Detected at least one parameter gradient is not the " + "expected DDP bucket view when setting " + "gradient_as_bucket_view=True. This can happen when " + "multiple parameters sharing the same gradient. For " + "example, param0 and param1 share the same gradient " + "grad0. In this case, grad0 would first point to " + "bucket_view_in0 when param0 is ready. 
Later, when " + "param1 is ready, it will override grad0 to point to " + "bucket_view_in1. However, param0 still expects grad0 " + "to point to bucket_view_in0, and hence hit this " + "warning. If you saw this message, please double-check if " + "the above situation is expected for your application."); + } + } + // The grad is modified and needs to be written back. + return true; + } + // The grad is not modified. + return false; + }); + } } } } @@ -1119,7 +1233,9 @@ void Reducer::runGradCallbackForVariable( cb(variable.mutable_grad()); } else { // Under distributed autograd +#ifndef _WIN32 context_ptr->runGradCallbackForVariable(variable, std::move(cb)); +#endif } } @@ -1225,8 +1341,9 @@ bool Reducer::rebuild_buckets() { replicas_[0].size() == rebuilt_param_indices_.size(), c10::str( "rebuilt parameter indices size is not same as original model parameters size.", + "Original model param size is: ", replicas_[0].size(), - " versus ", + " versus rebuilt params size of: ", rebuilt_param_indices_.size())); std::vector> rebuilt_bucket_indices; std::vector bucket_size_limits; diff --git a/torch/csrc/distributed/c10d/reducer.h b/torch/csrc/distributed/c10d/reducer.h index 87ad60330af7..5a17dbe6f1c2 100644 --- a/torch/csrc/distributed/c10d/reducer.h +++ b/torch/csrc/distributed/c10d/reducer.h @@ -30,7 +30,8 @@ class Reducer { std::shared_ptr process_group, std::vector> expect_sparse_gradients, int64_t bucket_bytes_cap, - bool find_unused_parameters); + bool find_unused_parameters, + bool gradient_as_bucket_view); ~Reducer() noexcept(false); @@ -54,7 +55,7 @@ class Reducer { return backward_stats_; } - // Registeres a hook to the reducer. The hook is `CommHookInterface` + // Registers a hook to the reducer. The hook is `CommHookInterface` // type to allow both Python and CPP hooks. This function can only // be called once before calling backward. void register_comm_hook(std::unique_ptr iface); @@ -89,7 +90,6 @@ class Reducer { // corresponding tensor being reduced. void set_forward_pass_work_handle( std::shared_ptr forwardPassWorkHandle, - at::Tensor& tensor, bool useStaticWorldSize); // Retrieve on-device tensors used to track locally unused parameters. For @@ -104,6 +104,13 @@ class Reducer { struct VariableIndex { size_t replica_index; size_t variable_index; + + VariableIndex() = default; + + VariableIndex(size_t replica_index_, size_t variable_index_) { + replica_index = replica_index_; + variable_index = variable_index_; + } }; void push_rebuilt_params(const VariableIndex& index); @@ -125,6 +132,7 @@ class Reducer { bool has_marked_unused_parameters_; const bool find_unused_parameters_; + const bool gradient_as_bucket_view_; std::vector unused_parameters_; // Locally used parameter maps indicating if parameters are used locally // during the current iteration or no_sync session if no_sync is on. One @@ -180,7 +188,7 @@ class Reducer { // and on the same device can be batched. The tensor that represents the // flattened gradient uses the same type and is placed on the same device. // Buckets are filled as the gradients they hold are computed (triggered by - // autograd hooks). Buckets are reduced in a predetemined order that is + // autograd hooks). Buckets are reduced in a predetermined order that is // identical across processes. struct BucketReplica { // Flattened (1 dimensional) contents of bucket. @@ -231,6 +239,19 @@ class Reducer { // with the result of `future_work`. 
void populate_bucket_views_out(BucketReplica& replica, at::Tensor& tensor); + // If gradient_as_bucket_view_ is false, after allreduce buckets, + // copy bucket results back to grads. + void copy_bucket_to_grad( + torch::autograd::Variable& variable, + Reducer::BucketReplica& replica, + size_t intra_bucket_index, + bool global_unused); + // Check layout of grad and bucket_view before calling copy_grad_to_bucket + void check_grad_layout(const at::Tensor& grad, const at::Tensor& bucket_view); + // If gradient_as_bucket_view_ is false, before allreduce buckets, + // copy grads to buckets. + void copy_grad_to_bucket(at::Tensor& grad, at::Tensor& bucket_view); + // A bucket holds N bucket replicas (1 per model replica). // // If every bucket in this struct is ready, the reduction can be kicked off. @@ -267,6 +288,13 @@ class Reducer { size_t bucket_index; // Index of parameter in single bucket replica. size_t intra_bucket_index; + + VariableLocator() = default; + + VariableLocator(size_t bucket_index_, size_t intra_bucket_index_) { + bucket_index = bucket_index_; + intra_bucket_index = intra_bucket_index_; + } }; // Map the index of a variable to its location in the bucket structure. diff --git a/torch/csrc/distributed/rpc/init.cpp b/torch/csrc/distributed/rpc/init.cpp index f85adb88dc09..f0b31b5389d2 100644 --- a/torch/csrc/distributed/rpc/init.cpp +++ b/torch/csrc/distributed/rpc/init.cpp @@ -27,12 +27,11 @@ namespace rpc { namespace { constexpr std::chrono::milliseconds kDeleteAllUsersTimeout(100000); -constexpr float kSecToMsConversion = 1000; template using shared_ptr_class_ = py::class_>; -PyObject* rpc_init(PyObject* /* unused */) { +PyObject* rpc_init(PyObject* _unused, PyObject* noargs) { auto rpc_module = THPObjectPtr(PyImport_ImportModule("torch.distributed.rpc")); if (!rpc_module) { @@ -50,6 +49,11 @@ PyObject* rpc_init(PyObject* /* unused */) { :meth:`~torch.distributed.rpc.init_rpc` in order to initialize RPC with specific configurations, such as the RPC timeout and ``init_method`` to be used. 
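+
+Example::
+    >>> import torch.distributed.rpc as rpc
+    >>> # Minimal sketch: construct options with a 60 second RPC timeout and the
+    >>> # default env:// init method, then hand them to init_rpc via its
+    >>> # rpc_backend_options argument.
+    >>> options = rpc.RpcBackendOptions(rpc_timeout=60.0, init_method="env://")
+    >>> # Should return 60.0
+    >>> options.rpc_timeout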
)") + .def(py::init<>()) + .def( + py::init(), + py::arg("rpc_timeout") = kDefaultRpcTimeoutSeconds, + py::arg("init_method") = kDefaultInitMethod) .def_readwrite( "rpc_timeout", &RpcBackendOptions::rpcTimeoutSeconds, @@ -77,7 +81,7 @@ PyObject* rpc_init(PyObject* /* unused */) { be constructed directly, rather, an instance can be retrieved through :meth:`~torch.distributed.rpc.get_worker_info` and the result can be passed in to functions such as - :meth:`~torch.distributed.rpc.rpc_sync`, :class:`~torch.distributed.rpc.rpc_async`, + :meth:`~torch.distributed.rpc.rpc_sync`, :meth:`~torch.distributed.rpc.rpc_async`, :meth:`~torch.distributed.rpc.remote` to avoid copying a string on every invocation.)") .def( @@ -773,7 +777,7 @@ PyObject* rpc_init(PyObject* /* unused */) { } // namespace static PyMethodDef methods[] = { // NOLINT - {"_rpc_init", (PyCFunction)rpc_init, METH_NOARGS, nullptr}, + {"_rpc_init", rpc_init, METH_NOARGS, nullptr}, {nullptr, nullptr, 0, nullptr}}; PyMethodDef* python_functions() { diff --git a/torch/csrc/distributed/rpc/process_group_agent.cpp b/torch/csrc/distributed/rpc/process_group_agent.cpp index fe93e43d01f3..d97577724a55 100644 --- a/torch/csrc/distributed/rpc/process_group_agent.cpp +++ b/torch/csrc/distributed/rpc/process_group_agent.cpp @@ -8,12 +8,6 @@ namespace torch { namespace distributed { namespace rpc { -const std::string kRPCTimeoutErrorStr = - "RPC ran for more than {} milliseconds and timed out."; - -namespace { -constexpr auto kSecToMsConversion = 1000; -} ////////////////////////// MessageCounter ///////////////////////////////// @@ -802,7 +796,7 @@ void ProcessGroupAgent::pollTimedOutRPCs() { for (const auto& timedOutFuture : timedOutFutures) { auto errStr = - fmt::format(kRPCTimeoutErrorStr, timedOutFuture.timeout_.count()); + fmt::format(kRpcTimeoutErrorStr, timedOutFuture.timeout_.count()); auto err = makeRPCError(errStr, RPCErrorType::TIMEOUT); if (!timedOutFuture.future_->hasError()) { diff --git a/torch/csrc/distributed/rpc/profiler/server_process_global_profiler.h b/torch/csrc/distributed/rpc/profiler/server_process_global_profiler.h index f4baed5218b6..b45026b184fe 100644 --- a/torch/csrc/distributed/rpc/profiler/server_process_global_profiler.h +++ b/torch/csrc/distributed/rpc/profiler/server_process_global_profiler.h @@ -51,8 +51,7 @@ class State { // parse_cpu_trace(result) for results of all profile range. 
std::mutex resultsMutex_; std::vector results_; - const ProfilerConfig config_ = - ProfilerConfig(ProfilerState::Disabled, false, false); + const ProfilerConfig config_ = ProfilerConfig(ProfilerState::Disabled); }; class StateStackEntry; diff --git a/torch/csrc/distributed/rpc/request_callback_impl.cpp b/torch/csrc/distributed/rpc/request_callback_impl.cpp index b68cb4092b67..c429fde123c6 100644 --- a/torch/csrc/distributed/rpc/request_callback_impl.cpp +++ b/torch/csrc/distributed/rpc/request_callback_impl.cpp @@ -502,6 +502,14 @@ void RequestCallbackImpl::processRpcWithErrors( } } +bool RequestCallbackImpl::cudaAvailable() const { + #ifdef USE_CUDA + return true; + #else + return false; + #endif +} + } // namespace rpc } // namespace distributed } // namespace torch diff --git a/torch/csrc/distributed/rpc/request_callback_impl.h b/torch/csrc/distributed/rpc/request_callback_impl.h index 0591cc88c7d0..836e496fb069 100644 --- a/torch/csrc/distributed/rpc/request_callback_impl.h +++ b/torch/csrc/distributed/rpc/request_callback_impl.h @@ -54,6 +54,8 @@ class TORCH_API RequestCallbackImpl : public RequestCallbackNoPython { const MessageType& messageType, const int64_t messageId, const std::shared_ptr& responseFuture) const override; + + bool cudaAvailable() const override; }; } // namespace rpc diff --git a/torch/csrc/distributed/rpc/request_callback_no_python.cpp b/torch/csrc/distributed/rpc/request_callback_no_python.cpp index 9aa1f2b2aa55..d41c8f271104 100644 --- a/torch/csrc/distributed/rpc/request_callback_no_python.cpp +++ b/torch/csrc/distributed/rpc/request_callback_no_python.cpp @@ -482,97 +482,37 @@ void RequestCallbackNoPython::processRpc( case MessageType::RUN_WITH_PROFILING_REQ: { auto& rpcWithProfilingReq = static_cast(rpc); auto wrappedMsgType = rpcWithProfilingReq.wrappedMessageType(); - const auto profilingConfig = rpcWithProfilingReq.getProfilingConfig(); + auto profilingConfig = rpcWithProfilingReq.getProfilingConfig(); + // If requested with CUDA from caller but CUDA is not available on this + // machine, fallback to CPU and log a warning instead of crashing. + if (profilingConfig.state == + torch::autograd::profiler::ProfilerState::CUDA && + !this->cudaAvailable()) { + profilingConfig = torch::autograd::profiler::ProfilerConfig( + torch::autograd::profiler::ProfilerState::CPU, + profilingConfig.report_input_shapes, + profilingConfig.profile_memory); + + LOG(WARNING) + << "Profiler was requested to be enabled with CUDA on this node, but CUDA is not available. " + << "Falling back to CPU profiling only."; + } + TORCH_INTERNAL_ASSERT( + profilingConfig.state != + torch::autograd::profiler::ProfilerState::CUDA || + this->cudaAvailable(), + "Profiler state set to CUDA but CUDA not available."); const auto profilingKeyId = rpcWithProfilingReq.getProfilingId(); auto wrappedRpcResponseFuture = std::make_shared(); // Enable the profiler with the config from the sender. - std::vector profiledEvents; + // When enabling on the main thread, ensure profiler states are cleaned + // up, but defer consolidation of all profiled events to the continuation + // below. 
+ torch::autograd::profiler::ProfilerDisableOptions requestThreadOptions( + true /* cleanup TLS state */, false /* consolidate events */); { torch::autograd::profiler::TLSProfilerGuard g( - profilingConfig, - [&profiledEvents, profilingConfig]( - const std::vector>& event_lists) { - // Gather all events into a vector - for (auto& l : event_lists) { - for (auto& e : l) { - profiledEvents.push_back(e); - } - } - // find __start_profile event and __cuda_start_event. - bool cuda_profiling_enabled = profilingConfig.state == - torch::autograd::profiler::ProfilerState::CUDA; - bool found_cpu_start = false; - const torch::autograd::profiler::Event* profilerStart = nullptr; - // Each device has its own cudaProfilerStart, so we must take - // care to use the correct one depending on the device the - // operation ran on. - std::unordered_map - cudaProfilerStarts; - for (auto& e : profiledEvents) { - if (!found_cpu_start && - 0 == strcmp(e.name(), "__start_profile")) { - profilerStart = &e; - found_cpu_start = true; - } - if (cuda_profiling_enabled && - 0 == strcmp(e.name(), "__cuda_start_event")) { - e.setCudaUs(e.cpu_us()); - auto device = e.device(); - TORCH_CHECK( - device != -1, - "CUDA profiling was enabled but could not find CUDA device."); - TORCH_CHECK( - cudaProfilerStarts.find(device) == - cudaProfilerStarts.end(), - c10::str( - "Duplicate __cuda_start_event found for ", device)); - cudaProfilerStarts[device] = &e; - } - // TODO: determine no. of CUDA devices and break here if we have - // a cudaProfilerStart for all of them, in the case of cuda - // profiling. - if (found_cpu_start && !cuda_profiling_enabled) { - break; - } - } - // We should always find __start_profile. - TORCH_CHECK( - profilerStart != nullptr, - "Expected to find __start_profile event."); - // Should have >= 1 CUDA start event. - // TODO: we can enhance this assert by ensuring we have found a - // start for every available CUDA device. - TORCH_CHECK( - !cuda_profiling_enabled || cudaProfilerStarts.size() > 0, - "Profiler was enabled with CUDA recording, but did not find __cuda_start_event."); - - if (cuda_profiling_enabled) { - // Compute and set global time for when this CUDA kernel was - // launched/ended, since deserialized event will not have a - // corresponding CUDA event. 
- for (auto& e : profiledEvents) { - if (e.has_cuda()) { - auto cuda_device = e.device(); - TORCH_CHECK( - cuda_device != -1, - "CUDA profiling was enabled but could not find CUDA device."); - auto it = cudaProfilerStarts.find(cuda_device); - TORCH_CHECK( - it != cudaProfilerStarts.end(), - c10::str( - "Failed to find __cuda_start_event for device ", - cuda_device)); - auto cudaProfilerStartEvent = it->second; - double cuda_elapsed_us = - cudaProfilerStartEvent->cuda_elapsed_us(e); - int64_t cuda_us = - cuda_elapsed_us + cudaProfilerStartEvent->cpu_us(); - e.setCudaUs(cuda_us); - } - } - } - }); + profilingConfig, c10::nullopt, requestThreadOptions); TORCH_INTERNAL_ASSERT( torch::autograd::profiler::profilerEnabled(), "Expected profiler to be enabled!"); @@ -583,25 +523,48 @@ void RequestCallbackNoPython::processRpc( wrappedMsgType, messageId, wrappedRpcResponseFuture); - } - wrappedRpcResponseFuture->addCallback([wrappedRpcResponseFuture, + + wrappedRpcResponseFuture->addCallback( + at::wrapPropagateTLSState([wrappedRpcResponseFuture, responseFuture, - profiledEvents = - std::move(profiledEvents), - profilingKeyId] { - if (wrappedRpcResponseFuture->hasError()) { - // Propagate error - responseFuture->setError(wrappedRpcResponseFuture->error()->what()); - } else { - auto rpcWithProfilingResp = std::make_unique( - MessageType::RUN_WITH_PROFILING_RESP, - std::move(*wrappedRpcResponseFuture).moveValue(), - profiledEvents, - profilingKeyId); - responseFuture->markCompleted( - std::move(*rpcWithProfilingResp).toMessage()); - } - }); + profilingKeyId, + profilingConfig] { + std::vector profiledEvents; + // Defer consolidation of profiler events until async work has + // completed (such as async UDF) + + TORCH_INTERNAL_ASSERT( + torch::autograd::profiler::profilerEnabled(), + "Expected profiler to be enabled!"); + + // On continuation thread, don't clean up profiler states, since + // they will be cleaned up by main thread, and consolidate all + // events so we obtain asynchronously run events. + torch::autograd::profiler::ProfilerDisableOptions opts( + false, true); + auto event_lists = + torch::autograd::profiler::disableProfiler(opts); + if (wrappedRpcResponseFuture->hasError()) { + // Propagate error + // No need to propagate remote events in the case of an error. + responseFuture->setError( + wrappedRpcResponseFuture->error()->what()); + } else { + populateRemoteProfiledEvents( + profiledEvents, profilingConfig, event_lists); + auto rpcWithProfilingResp = + std::make_unique( + MessageType::RUN_WITH_PROFILING_RESP, + std::move(*wrappedRpcResponseFuture).moveValue(), + profiledEvents, + profilingKeyId); + responseFuture->markCompleted( + std::move(*rpcWithProfilingResp).toMessage()); + } + })); + // Exiting the scope will disable the profiler on this thread with the + // options specified above. 
+ } return; } default: { @@ -627,6 +590,14 @@ Message RequestCallbackNoPython::handleError( return createExceptionResponse(errorMsg, messageId); } +bool RequestCallbackNoPython::cudaAvailable() const { + #ifdef USE_CUDA + return true; + #else + return false; + #endif +} + } // namespace rpc } // namespace distributed } // namespace torch diff --git a/torch/csrc/distributed/rpc/request_callback_no_python.h b/torch/csrc/distributed/rpc/request_callback_no_python.h index dd54ea009417..b54fe172d7b6 100644 --- a/torch/csrc/distributed/rpc/request_callback_no_python.h +++ b/torch/csrc/distributed/rpc/request_callback_no_python.h @@ -84,6 +84,8 @@ class TORCH_API RequestCallbackNoPython : public RequestCallback { const std::exception& e, const MessageType messageType, int64_t messageId) const; + + virtual bool cudaAvailable() const; }; } // namespace rpc diff --git a/torch/csrc/distributed/rpc/rpc_agent.h b/torch/csrc/distributed/rpc/rpc_agent.h index 605744a1f227..34b77a085510 100644 --- a/torch/csrc/distributed/rpc/rpc_agent.h +++ b/torch/csrc/distributed/rpc/rpc_agent.h @@ -17,6 +17,9 @@ constexpr float kDefaultRpcTimeoutSeconds = 60; // timeout for RPCs. constexpr float kUnsetRpcTimeout = -1; constexpr auto kDefaultInitMethod = "env://"; +constexpr float kSecToMsConversion = 1000; +constexpr auto kRpcTimeoutErrorStr = + "RPC ran for more than set timeout ({} ms) and will now be marked with an error"; using steady_clock_time_point = std::chrono::time_point; diff --git a/torch/csrc/distributed/rpc/rref_impl.cpp b/torch/csrc/distributed/rpc/rref_impl.cpp index 34249172473c..6c6a377a4652 100644 --- a/torch/csrc/distributed/rpc/rref_impl.cpp +++ b/torch/csrc/distributed/rpc/rref_impl.cpp @@ -141,9 +141,6 @@ IValue UserRRef::toHere(const float timeoutSeconds) const { "to_here#({})->({})", RpcAgent::getCurrentRpcAgent()->getWorkerInfo().name_, RpcAgent::getCurrentRpcAgent()->getWorkerInfo(ownerId_).name_); - auto& remoteProfilerManager = - torch::distributed::rpc::RemoteProfilerManager::getInstance(); - remoteProfilerManager.setCurrentKey(toHereKey); } RECORD_USER_SCOPE(toHereKey); TORCH_CHECK( @@ -170,12 +167,16 @@ IValue UserRRef::toHere(const float timeoutSeconds) const { msgToSend = ScriptRRefFetchCall(ownerId_, rrefId()).toMessage(); } + // toHere is profiled as a blocking call, and does not execute operations on + // the remote node. Hence, don't wrap it with a profiling message since we + // don't need the profiler to be enabled remotely. 
auto futureResponse = autograd::sendMessageWithAutograd( *agent, agent->getWorkerInfo(ownerId_), std::move(msgToSend), true /* forceGradRecording */, - timeoutSeconds); + timeoutSeconds, + true /* forceDisableProfiling */); // TODO: we should ideally be able to interrupt this blocking wait if we check // getTimedOut() and it is true diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp index d9ce2c3b27eb..11c5408c2c35 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp @@ -22,16 +22,12 @@ namespace { const std::string kSocketIfnameEnvVar = "TP_SOCKET_IFNAME"; const std::string kDefaultUvAddress = "127.0.0.1"; -constexpr long kToMilliseconds = 1000; - const std::string kGilAverageWaitTime = "agent.gil_average_wait_time_us"; const std::string kThreadPoolSize = "agent.thread_pool_size"; const std::string kNumIdleThreads = "agent.num_idle_threads"; const std::string kClientActiveCalls = "agent.client_active_calls"; const std::string kServerActiveCalls = "agent.server_active_calls"; const std::string kServerActiveAsyncCalls = "agent.server_active_async_calls"; -const std::string kRpcTimeoutErrorStr = - "RPC ran for more than set timeout ({} ms) and will now be marked with an error"; inline void checkCPUTensor(const torch::Tensor& tensor) { TORCH_CHECK( @@ -273,7 +269,7 @@ TensorPipeAgent::TensorPipeAgent( WorkerInfo(std::move(selfName), selfId), std::move(cb), std::chrono::milliseconds( - (long)(opts.rpcTimeoutSeconds * kToMilliseconds))), + (long)(opts.rpcTimeoutSeconds * kSecToMsConversion))), opts_(std::move(opts)), threadPool_(opts_.numWorkerThreads), context_(std::make_shared( @@ -685,7 +681,7 @@ std::shared_ptr TensorPipeAgent::send( auto timeout = rpcTimeoutSeconds == kUnsetRpcTimeout ? getRpcTimeout() : std::chrono::milliseconds( - static_cast(rpcTimeoutSeconds * kToMilliseconds)); + static_cast(rpcTimeoutSeconds * kSecToMsConversion)); // We only add to the timeoutMap_ if the timeout is not 0. 
Per our // documentation, a user-provided timeout of 0 indicates the RPC should never diff --git a/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.cpp b/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.cpp index a03ff5cafecd..a1be688a285e 100644 --- a/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.cpp +++ b/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.cpp @@ -6,10 +6,6 @@ namespace torch { namespace distributed { namespace rpc { -namespace { -constexpr auto kSecToMsConversion = 1000; -} - std::string fromVec(const std::vector& vec) { return std::string(vec.begin(), vec.end()); } diff --git a/torch/csrc/distributed/rpc/testing/init.cpp b/torch/csrc/distributed/rpc/testing/init.cpp index cdb67e2ea6b5..a662faed88ba 100644 --- a/torch/csrc/distributed/rpc/testing/init.cpp +++ b/torch/csrc/distributed/rpc/testing/init.cpp @@ -17,7 +17,7 @@ namespace { template using shared_ptr_class_ = py::class_>; -PyObject* faulty_agent_init(PyObject* /* unused */) { +PyObject* faulty_agent_init(PyObject* _unused, PyObject* noargs) { // Add the FaultyProcessGroupAgent and its backend options object to the // python module torch.distributed.rpc._testing auto faulty_agent_module = @@ -110,7 +110,7 @@ PyObject* faulty_agent_init(PyObject* /* unused */) { static PyMethodDef methods[] = { // NOLINT {"_faulty_agent_init", - (PyCFunction)faulty_agent_init, + faulty_agent_init, METH_NOARGS, nullptr}, {nullptr, nullptr, 0, nullptr}}; diff --git a/torch/csrc/distributed/rpc/utils.cpp b/torch/csrc/distributed/rpc/utils.cpp index 981cfd50f95e..fa97ea116a0c 100644 --- a/torch/csrc/distributed/rpc/utils.cpp +++ b/torch/csrc/distributed/rpc/utils.cpp @@ -501,6 +501,85 @@ std::vector readWrappedPayload( payload.resize(payload.size() - additionalPayloadSize); return tupleElements; } + +void populateRemoteProfiledEvents( + std::vector& profiledEvents, + const torch::autograd::profiler::ProfilerConfig& profilingConfig, + const std::vector>& + eventLists) { + // Gather all events into a vector + for (auto& l : eventLists) { + for (auto& e : l) { + profiledEvents.push_back(e); + } + } + // find __start_profile event and __cuda_start_event. + bool cudaProfilingEnabled = + profilingConfig.state == torch::autograd::profiler::ProfilerState::CUDA; + bool foundCpuStart = false; + const torch::autograd::profiler::Event* profilerStart = nullptr; + // Each device has its own cudaProfilerStart, so we must take + // care to use the correct one depending on the device the + // operation ran on. + std::unordered_map + cudaProfilerStarts; + for (auto& e : profiledEvents) { + if (!foundCpuStart && 0 == strcmp(e.name(), "__start_profile")) { + profilerStart = &e; + foundCpuStart = true; + } else if (cudaProfilingEnabled && 0 == strcmp(e.name(), "__cuda_start_event")) { + e.setCudaUs(e.cpuUs()); + auto device = e.device(); + TORCH_CHECK( + device != -1, + "CUDA profiling was enabled but could not find CUDA device."); + TORCH_CHECK( + cudaProfilerStarts.find(device) == cudaProfilerStarts.end(), + c10::str("Duplicate __cuda_start_event found for ", device)); + cudaProfilerStarts[device] = &e; + } + + // TODO: determine no. of CUDA devices and break here if we have + // a cudaProfilerStart for all of them, in the case of cuda + // profiling. + if (foundCpuStart && !cudaProfilingEnabled) { + break; + } + } + // We should always find __start_profile. 
+ TORCH_CHECK( + profilerStart != nullptr, "Expected to find __start_profile event."); + // Should have >= 1 CUDA start event if cudaProfilingEnabled. + // TODO: we can enhance this assert by ensuring we have found a + // start for every available CUDA device. + TORCH_CHECK( + !cudaProfilingEnabled || cudaProfilerStarts.size() > 0, + "Profiler was enabled with CUDA recording, but did not find __cuda_start_event."); + + if (cudaProfilingEnabled) { + // Compute and set global time for when this CUDA kernel was + // launched/ended, since deserialized event will not have a + // corresponding CUDA event. + for (auto& e : profiledEvents) { + if (e.hasCuda()) { + auto cudaDevice = e.device(); + TORCH_CHECK( + cudaDevice != -1, + "CUDA profiling was enabled but could not find CUDA device."); + auto it = cudaProfilerStarts.find(cudaDevice); + TORCH_CHECK( + it != cudaProfilerStarts.end(), + c10::str( + "Failed to find __cuda_start_event for device ", cudaDevice)); + auto cudaProfilerStartEvent = it->second; + double cudaElapsedUs = cudaProfilerStartEvent->cudaElapsedUs(e); + int64_t cudaUs = cudaElapsedUs + cudaProfilerStartEvent->cpuUs(); + e.setCudaUs(cudaUs); + } + } + } +} + } // namespace rpc } // namespace distributed } // namespace torch diff --git a/torch/csrc/distributed/rpc/utils.h b/torch/csrc/distributed/rpc/utils.h index 806b52208eb0..f91dfb4f4c7d 100644 --- a/torch/csrc/distributed/rpc/utils.h +++ b/torch/csrc/distributed/rpc/utils.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -78,6 +79,14 @@ TORCH_API std::vector readWrappedPayload( std::vector& payload, const rpc::Message& message); +// Takes a list of events from autograd profiler and populates them into +// profiledEvents to be carried over RPC. +TORCH_API void populateRemoteProfiledEvents( + std::vector& profiledEvents, + const torch::autograd::profiler::ProfilerConfig& profilerConfig, + const std::vector>& + eventLists); + } // namespace rpc } // namespace distributed } // namespace torch diff --git a/torch/csrc/jit/backends/backend_init.cpp b/torch/csrc/jit/backends/backend_init.cpp index b01cb62dc3a2..17c92cb14023 100644 --- a/torch/csrc/jit/backends/backend_init.cpp +++ b/torch/csrc/jit/backends/backend_init.cpp @@ -226,11 +226,13 @@ void initJitBackendBindings(PyObject* module) { m.def( "_jit_to_backend", [=](const std::string& backend_name, - const Module& orig_module, + py::handle orig_module, const py::dict& method_compile_spec) { return py::module::import("torch.jit._recursive") - .attr("wrap_cpp_module")( - codegen_lambda(backend_name, orig_module, method_compile_spec)); + .attr("wrap_cpp_module")(codegen_lambda( + backend_name, + py::cast(orig_module.attr("_c")), + method_compile_spec)); }); } } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/codegen.cpp b/torch/csrc/jit/codegen/cuda/codegen.cpp new file mode 100644 index 000000000000..f6e791f0edba --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/codegen.cpp @@ -0,0 +1,640 @@ + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace codegen { + +namespace { + +class CudaKernelGenerator : private OptInConstDispatch { + static constexpr char* kTab = " "; + + public: + static std::string generateKernelDefinition( + const Kernel* kernel, + const std::string& kernel_name) { + CudaKernelGenerator codegen(kernel); + codegen.genDeclaration(kernel_name); + codegen.startBlock(); + codegen.genPrologue(); + codegen.genBody(); + 
codegen.endBlock(); + TORCH_CHECK(codegen.block_nest_level_ == 0); + return codegen.code_.str(); + } + + private: + explicit CudaKernelGenerator(const Kernel* kernel) : kernel_(kernel) {} + + // Generates the kernel function declaration + void genDeclaration(const std::string& kernel_name) { + const auto& kernel_summary = kernel_->summary(); + + code_ << "__global__ void " << kernel_name << "("; + + std::vector params; + + // Inputs + for (auto val : kernel_->inputs()) { + params.push_back(val); + } + + // Outputs + for (auto val : kernel_->outputs()) { + params.push_back(val); + } + + // Global buffers + for (auto allocate : kernel_summary.global_allocations) { + params.push_back(allocate->buffer()); + } + + // Generate parameter declarations + for (Val* val : params) { + switch (val->getValType().value()) { + case ValType::KirTensorView: { + // TODO(kir): review this + const auto tv = val->as(); + code_ << "Tensor<" << val->getDataType().value() << ", " + << TensorDomain::noReductions( + tv->fuserTv()->getMaybeRFactorDomain()) + .size() + << "> " << gen(tv); + break; + } + case ValType::KirScalar: + code_ << val->getDataType().value() << " " << gen(val); + break; + default: + TORCH_CHECK(!"Unexpected parameter type"); + } + + if (val != params.back()) { + code_ << ", "; + } + } + + // Kernels generating random numbers take extra (seed, offset) arguments + if (kernel_summary.is_stochastic) { + code_ << ", unsigned long long seed, unsigned long long offset"; + } + + code_ << ") "; + } + + // Generates setup code which is executed before the kernel body + void genPrologue() { + const auto& kernel_summary = kernel_->summary(); + + // Random number generator (optional) + if (kernel_summary.is_stochastic) { + indent() << "const int idx = blockIdx.x*blockDim.x + threadIdx.x;\n"; + indent() << "Philox rnd(seed, idx, offset);\n"; + } + + // Do we have any dynamic shared memory buffers? + const bool has_dynamic_smem = + !kernel_summary.dynamic_smem_allocations.empty(); + + // Do we have any reductions? 
+ const bool has_reductions = kernel_summary.has_block_reductions || + kernel_summary.has_grid_reductions; + + // Shared memory + if (has_dynamic_smem || has_reductions) { + indent() << "alignas(" + << dataTypeSize(kernel_summary.largest_smem_data_type) + << ") extern __shared__ char array[];\n"; + + if (has_dynamic_smem) { + indent() << "unsigned offset = 0;\n"; + } + + if (has_reductions) { + indent() << "void* shared_mem = array;\n"; + if (has_dynamic_smem) { + indent() << "offset += " + << "((blockDim.x * blockDim.y * blockDim.z) * sizeof(" + << kernel_summary.largest_smem_data_type << "));\n"; + } + } + } + } + + void genBody() { + for (auto expr : kernel_->topLevelExprs()) { + OptInConstDispatch::handle(expr); + } + } + + void startBlock(bool continuation = false) { + if (continuation) { + code_ << "{\n"; + } else { + indent() << "{\n"; + } + ++block_nest_level_; + } + + void endBlock(const char* sep = "\n") { + --block_nest_level_; + TORCH_CHECK(block_nest_level_ >= 0); + indent() << "}" << sep; + } + + std::ostream& indent() { + for (int i = 0; i < block_nest_level_; ++i) { + code_ << kTab; + } + return code_; + } + + std::string gen(const Statement* stmt) { + std::stringstream tmp_code; + std::swap(tmp_code, code_); + handle(stmt); + std::swap(tmp_code, code_); + return tmp_code.str(); + } + + std::string gen(const kir::TensorView* tv) { + std::stringstream tv_name; + tv_name << "T" << tv->name(); + return tv_name.str(); + } + + std::string genInline(const Statement* stmt) { + const bool saved_inline = print_inline_; + print_inline_ = true; + const auto result = gen(stmt); + print_inline_ = saved_inline; + return result; + } + + void handle(const Statement* node) final { + OptInConstDispatch::handle(node); + } + + void handle(const Expr* node) final { + OptInConstDispatch::handle(node); + } + + void handle(const Val* node) final { + OptInConstDispatch::handle(node); + } + + void handle(const kir::Bool* node) final { + const auto def = node->getOrigin(); + if (print_inline_ && def != nullptr) { + code_ << "(" << gen(def) << ")"; + } else if (node->isSymbolic()) { + code_ << "b" << node->name(); + } else { + code_ << *node->value(); + } + } + + void handle(const kir::Float* node) final { + const auto def = node->getOrigin(); + if (print_inline_ && def != nullptr) { + code_ << "(" << gen(def) << ")"; + } else if (node->isSymbolic()) { + code_ << "f" << node->name(); + } else { + const int digits = std::numeric_limits::max_digits10; + code_ << "float(" << std::setprecision(digits) << *node->value() << ")"; + } + } + + void handle(const kir::Half* node) final { + const auto def = node->getOrigin(); + if (print_inline_ && def != nullptr) { + code_ << "(" << gen(def) << ")"; + } else if (node->isSymbolic()) { + code_ << "h" << node->name(); + } else { + code_ << "__float2half(" << *node->value() << ")"; + } + } + + void handle(const kir::Int* node) final { + const auto def = node->getOrigin(); + if (print_inline_ && def != nullptr) { + code_ << "(" << gen(def) << ")"; + } else if (node->isSymbolic()) { + code_ << "i" << node->name(); + } else { + code_ << *node->value(); + } + } + + void handle(const kir::NamedScalar* node) final { + code_ << node->name(); + } + + void handle(const kir::TensorIndex* node) final { + code_ << gen(node->view()) << "["; + + bool first = true; + for (auto* ind : node->indices()) { + if (!ind->isZeroInt()) { + if (!first) { + code_ << " + "; + } + code_ << genInline(ind); + first = false; + } + } + + if (first) { + code_ << "0"; + } + + code_ << "]"; + } + + 
void handle(const kir::IterDomain* node) final { + TORCH_INTERNAL_ASSERT(!"Unreachable"); + } + + void handle(const kir::TensorDomain* node) final { + TORCH_INTERNAL_ASSERT(!"Unreachable"); + } + + void handle(const kir::TensorView* node) final { + TORCH_INTERNAL_ASSERT(!"Unreachable"); + } + + void handle(const kir::UnaryOp* node) final { + if (!print_inline_) { + indent() << gen(node->out()); + if (!node->out()->isScalar() && !node->in()->isScalar()) { + code_ << "\n"; + indent() << kTab; + } + code_ << " = "; + } + + if (auto op = inline_op_str(node->getUnaryOpType())) { + code_ << *op << gen(node->in()); + } else { + if (node->getUnaryOpType() == UnaryOpType::Cast) { + const auto cast_str = + cast_func_str({node->in()->getDataType().value(), + node->out()->getDataType().value()}); + code_ << cast_str.value(); + } else { + code_ << node->getUnaryOpType(); + } + + code_ << "("; + if (node->getUnaryOpType() == UnaryOpType::RandLike) { + code_ << "rnd"; + } else { + code_ << gen(node->in()); + } + code_ << ")"; + } + + if (!print_inline_) { + code_ << ";\n"; + } + } + + std::string genBinaryOp( + BinaryOpType op_type, + const std::string& lhs, + const std::string& rhs) { + std::stringstream expr; + if (auto op = inline_op_str(op_type)) { + expr << lhs << " " << *op << " " << rhs; + } else { + expr << op_type << "(" << lhs << ", " << rhs << ")"; + } + return expr.str(); + } + + void handle(const kir::BinaryOp* node) final { + const auto op_type = node->getBinaryOpType(); + if (print_inline_) { + // Inline expression: `lhs op rhs` + code_ << genBinaryOp(op_type, gen(node->lhs()), gen(node->rhs())); + } else { + indent() << gen(node->out()); + if (node->out()->isScalar()) { + // Single line: `out = lhs op rhs;` + code_ << " = " + << genBinaryOp(op_type, gen(node->lhs()), gen(node->rhs())); + } else { + // Split TensorView expressions across multiple lines: + // + // out + // = lhs + // op rhs; + // + if (auto op = inline_op_str(op_type)) { + code_ << "\n"; + indent() << kTab << "= " << gen(node->lhs()) << "\n"; + indent() << kTab << *op << " " << gen(node->rhs()); + } else { + code_ << " = " << op_type << "(\n"; + indent() << kTab << gen(node->lhs()) << ",\n"; + indent() << kTab << gen(node->rhs()) << ")"; + } + } + code_ << ";\n"; + } + } + + void handle(const kir::TernaryOp* node) final { + if (!print_inline_) { + indent() << gen(node->out()); + if (!node->out()->isScalar()) { + code_ << "\n"; + indent() << kTab; + } + code_ << " = "; + } + + code_ << node->getTernaryOpType() << "(" << gen(node->in1()) << ", " + << gen(node->in2()) << ", " << gen(node->in3()) << ")"; + + if (!print_inline_) { + code_ << ";\n"; + } + } + + std::string genReductionOp(BinaryOpType op_type, DataType data_type) { + std::stringstream lambda; + lambda << "[](" << data_type << " &a, " << data_type << " b) " + << "{ a = " << genBinaryOp(op_type, "a", "b") << "; }"; + return lambda.str(); + } + + void handle(const kir::BroadcastOp* node) final { + const ir_utils::ParallelTypeBitmap domains = + ir_utils::getParallelBroadcastDomains( + node->out(), kernel_->predicateMap()); + + const bool thread_x = domains.get(ParallelType::TIDx); + const bool thread_y = domains.get(ParallelType::TIDy); + const bool thread_z = domains.get(ParallelType::TIDz); + const bool block_x = domains.get(ParallelType::BIDx); + const bool block_y = domains.get(ParallelType::BIDy); + const bool block_z = domains.get(ParallelType::BIDz); + + const bool grid_broadcast_needed = block_x || block_y || block_z; + const bool block_broadcast_needed = 
thread_x || thread_y || thread_z; + + TORCH_INTERNAL_ASSERT( + !grid_broadcast_needed, + "Parallel broadcast across blocks not supported"); + + if (block_broadcast_needed) { + const auto data_type = node->out()->getDataType().value(); + indent() << "broadcast::blockBroadcast<" << (thread_x ? "true" : "false") + << ", " << (thread_y ? "true" : "false") << ", " + << (thread_z ? "true" : "false") << ">(\n"; + indent() << kTab << gen(node->out()) << ",\n"; + indent() << kTab << gen(node->in()) << ",\n"; + indent() << kTab << "static_cast<" << data_type << "*>(shared_mem));\n"; + } else { + indent() << gen(node->out()) << "\n"; + indent() << kTab << " = " << gen(node->in()) << ";\n"; + } + } + + void handle(const kir::ReductionOp* node) final { + TORCH_CHECK(node->out()->getValType() == ValType::TensorIndex); + + const auto out = node->out()->as(); + const auto domain = out->view()->domain(); + + const bool has_block_reduce = domain->hasBlockReduction(); + const bool has_grid_reduce = domain->hasGridReduction(); + + if (!has_block_reduce && !has_grid_reduce) { + const auto gen_out = gen(out); + const auto op_type = node->getReductionOpType(); + indent() << gen_out << " = " + << genBinaryOp(op_type, gen_out, gen(node->in())) << ";\n"; + return; + } + + const auto par_domains = node->getParallelReductionDomains(); + const bool tidx = par_domains.find(ParallelType::TIDx) != par_domains.end(); + const bool tidy = par_domains.find(ParallelType::TIDy) != par_domains.end(); + const bool tidz = par_domains.find(ParallelType::TIDz) != par_domains.end(); + + const auto data_type = node->out()->getDataType().value(); + const auto op_type = node->getReductionOpType(); + + if (has_block_reduce) { + if (has_grid_reduce) { + indent() << data_type << " " + << "block_result" + << ";\n"; + } + indent() << "blockReduce<" << (tidx ? "true" : "false") << ", " + << (tidy ? "true" : "false") << ", " << (tidz ? 
"true" : "false") + << ">(\n"; + if (has_grid_reduce) { + indent() << kTab << "block_result" + << ",\n"; + } else { + indent() << kTab << gen(node->out()) << ",\n"; + } + indent() << kTab << gen(node->in()) << ",\n"; + indent() << kTab << genReductionOp(op_type, data_type) << ",\n"; + indent() << kTab << "threadIdx,\n"; + indent() << kTab << "blockDim,\n"; + indent() << kTab << "static_cast<" << data_type << "*>(shared_mem),\n"; + if (node->pred() == nullptr) { + indent() << kTab << "true,\n"; + } else { + indent() << kTab << genInline(node->pred()) << ",\n"; + } + indent() << kTab << genInline(node->init()) << ");\n"; + } + } + + void handle(const kir::GridReduction* node) final { + const auto rop = node->reduction_op(); + TORCH_INTERNAL_ASSERT(rop->out()->getValType() == ValType::TensorIndex); + + const auto out = rop->out()->as(); + const auto domain = out->view()->domain(); + TORCH_INTERNAL_ASSERT(domain->hasGridReduction()); + + const auto par_domains = rop->getParallelReductionDomains(); + const bool tidx = par_domains.find(ParallelType::TIDx) != par_domains.end(); + const bool tidy = par_domains.find(ParallelType::TIDy) != par_domains.end(); + const bool tidz = par_domains.find(ParallelType::TIDz) != par_domains.end(); + const bool bidx = par_domains.find(ParallelType::BIDx) != par_domains.end(); + const bool bidy = par_domains.find(ParallelType::BIDy) != par_domains.end(); + const bool bidz = par_domains.find(ParallelType::BIDz) != par_domains.end(); + + const auto data_type = rop->out()->getDataType().value(); + const auto op_type = rop->getReductionOpType(); + + TORCH_INTERNAL_ASSERT( + node->reduction_buffer()->buffer()->getValType().value() == + ValType::KirTensorView); + TORCH_INTERNAL_ASSERT( + node->sync_buffer()->buffer()->getValType().value() == + ValType::KirTensorView); + const auto work_buffer = + node->reduction_buffer()->buffer()->as(); + const auto sync_buffer = + node->sync_buffer()->buffer()->as(); + + // Since block-level reduction is already done, those dimensions + // with tidx/y/z being true do not participate in the grid reduction. + indent() << kir::GridReduction::getPredicateFlagName(out->view()) << " = " + << "reduction::gridReduce<" << (bidx ? "true" : "false") << ", " + << (bidy ? "true" : "false") << ", " << (bidz ? "true" : "false") + << ", " << (!tidx ? "true" : "false") << ", " + << (!tidy ? "true" : "false") << ", " << (!tidz ? 
"true" : "false") + << ">(\n"; + indent() << kTab << gen(rop->out()) << ",\n"; + if (domain->hasBlockReduction()) { + indent() << kTab << "block_result" + << ",\n"; + } else { + indent() << kTab << gen(rop->in()) << ",\n"; + } + indent() << kTab << genReductionOp(op_type, data_type) << ",\n"; + indent() << kTab << "&" << gen(work_buffer) << "[0],\n"; + indent() << kTab << gen(sync_buffer) << ",\n"; + indent() << kTab << "static_cast<" << data_type << "*>(shared_mem),\n"; + if (node->pred() == nullptr) { + indent() << kTab << "true,\n"; + } else { + indent() << kTab << genInline(node->pred()) << ",\n"; + } + indent() << kTab << genInline(node->reduction_op()->init()) << ");\n"; + } + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Woverloaded-virtual" + // TODO(Kir): fix me + void handle(const kir::Scope& scope) { + for (auto expr : scope.exprs()) { + handle(expr); + } + } +#pragma clang diagnostic pop + + void handle(const kir::ForLoop* node) final { + // TODO(kir): handle this during lowering + if (node->iter_domain()->isThread() || node->iter_domain()->isBroadcast()) { + handle(node->body()); + return; + } + + const auto gen_index = gen(node->index()); + const auto gen_start = genInline(node->iter_domain()->start()); + const auto gen_extent = genInline(node->iter_domain()->extent()); + indent() << "for(size_t " << gen_index << " = " << gen_start << "; " + << gen_index << " < " << gen_extent << "; ++" << gen_index << ") "; + + startBlock(true); + handle(node->body()); + endBlock(); + } + + void handle(const kir::IfThenElse* node) final { + indent() << "if (" << genInline(node->cond()) << ") "; + + // "then" block + startBlock(true); + handle(node->thenBody()); + + // "else" block (optional) + if (node->hasElse()) { + endBlock(" else "); + startBlock(true); + handle(node->elseBody()); + } + + endBlock(); + } + + // TODO(kir): fold initialization into Allocate + void handle(const kir::Allocate* node) final { + if (node->buffer()->getValType().value() != ValType::KirTensorView) { + indent() << node->buffer_type() << " " << gen(node->buffer()) << ";\n"; + return; + } + + const auto tv = node->buffer()->as(); + TORCH_INTERNAL_ASSERT(tv->domain()->nDims() > 0); + TORCH_INTERNAL_ASSERT(node->size() != nullptr); + + switch (tv->memoryType()) { + case MemoryType::Global: + indent() << "// Allocate global tensor " << gen(tv) << "\n"; + break; + case MemoryType::Shared: + if (node->size()->isConstScalar()) { + // Static shared memory + indent() << "__shared__ " << node->buffer_type() << " " << gen(tv) + << "[" << genInline(node->size()) << "];\n"; + } else { + // Align Offset Position + indent() << "offset = alignBufferSize(offset," + << dataTypeSize(node->buffer_type()) << ");\n"; + // Shared Memory Pointer + indent() << node->buffer_type() << "* " << gen(tv) + << " = reinterpret_cast<" << node->buffer_type() << "*>" + << "(array + offset);\n"; + // Increment Offset Position + indent() << "offset += (" << genInline(node->size()) << " * sizeof(" + << node->buffer_type() << "));\n"; + } + break; + case MemoryType::Local: + indent() << node->buffer_type() << " " << gen(tv) << "[" + << genInline(node->size()) << "];\n"; + break; + default: + TORCH_INTERNAL_ASSERT(false, "Unexpected memory type"); + } + } + + void handle(const kir::Sync* node) final { + indent() << "__syncthreads();\n"; + } + + private: + std::stringstream code_; + const Kernel* kernel_; + int block_nest_level_ = 0; + + // TODO(kir): replace with explicit assignment statements + bool print_inline_ = false; +}; + +} // 
namespace + +std::string generateCudaKernel( + const Kernel* kernel, + const std::string& kernel_name) { + FUSER_PERF_SCOPE("generateCudaKernel"); + return CudaKernelGenerator::generateKernelDefinition(kernel, kernel_name); +} + +} // namespace codegen +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/codegen.h b/torch/csrc/jit/codegen/cuda/codegen.h new file mode 100644 index 000000000000..562aa1554eb2 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/codegen.h @@ -0,0 +1,22 @@ + +#pragma once + +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace codegen { + +//! Generates a CUDA kernel definition for the given kernel +TORCH_CUDA_API std::string generateCudaKernel( + const Kernel* kernel, + const std::string& kernel_name = "CUDAGeneratedKernel"); + +} // namespace codegen +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/compute_at.cpp b/torch/csrc/jit/codegen/cuda/compute_at.cpp index 3e0f5303b966..9f8f7aba1cf4 100644 --- a/torch/csrc/jit/codegen/cuda/compute_at.cpp +++ b/torch/csrc/jit/codegen/cuda/compute_at.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -20,11 +21,10 @@ ComputeAtData::ComputeAtData(TensorView* tv) void ComputeAtData::clearPass() { // If the last pass set a position, update the new_compute_at_position if // latest position would be greater than previously set. - auto pass_pos = current_traversal_position_set ? current_traversal_position - : new_compute_at_position; - - new_compute_at_position = - pass_pos > new_compute_at_position ? pass_pos : new_compute_at_position; + if (current_traversal_position_set && + current_traversal_position > new_compute_at_position) { + new_compute_at_position = current_traversal_position; + } current_traversal_position_set = false; current_traversal_position = 0; @@ -52,16 +52,19 @@ void ComputeAtData::setPassPosition(unsigned int pos) { } unsigned int ComputeAtData::getNewPosition() const { - // If the last pass set a position, update the new_compute_at_position if - // latest position would be greater than previously set. - auto pass_pos = current_traversal_position_set ? current_traversal_position - : new_compute_at_position; - - return pass_pos > new_compute_at_position ? pass_pos - : new_compute_at_position; + // If the last pass set a position, return the latest position if + // it would be greater than previously set. + if (current_traversal_position_set && + current_traversal_position > new_compute_at_position) { + return current_traversal_position; + } else { + return new_compute_at_position; + } } void ComputeAtData::validateNewComputeAt() const { + FUSER_PERF_SCOPE("validateNewComputeAt"); + TORCH_INTERNAL_ASSERT( getNewPosition() >= original_compute_at_position, "Invalid computeAt detected. 
This computeAt would invalidate the set computeAt on ", @@ -82,7 +85,22 @@ void ComputeAtData::validateNewComputeAt() const { "."); } +void ComputeAtData::setComputeAtDomain(TensorDomain* td) { + if (new_compute_at_domain_ != original_domain_) { + TORCH_INTERNAL_ASSERT( + *new_compute_at_domain_ == *td, + "TensorDomain, ", + td, + ", does not match with the previously set domain of ", + tv_ref_, + ", which is ", + new_compute_at_domain_); + } + new_compute_at_domain_ = td; +} + namespace { + // Wrapper around set_intersection template std::set set_intersection(const std::set& set1, const std::set& set2) { @@ -121,12 +139,15 @@ std::deque> tvChains( } return tv_chains; } + } // namespace void ComputeAt::run( TensorView* producer, TensorView* consumer, unsigned int consumer_position) { + FUSER_PERF_SCOPE("ComputeAt::run"); + // Make sure the correct fusion is setup between this and consumer. TORCH_CHECK( producer->fusion() == consumer->fusion(), @@ -160,6 +181,9 @@ void ComputeAt::run( // Check all dependency chains, select the next TV after producer towards // consumer. These are the TVs we're going to actually call computeAt on. for (const auto& tv_chain : all_chains) { + // When a chain only has two tensors, they must be the producer, + // which is an input, and the consumer. There is nothing we need + // to do for such chains. if (tv_chain.size() > 2) { // Make sure we only add once, but we want to add in a determinsitic // order @@ -188,6 +212,8 @@ unsigned int ComputeAt::backwardComputeAt_impl( TensorView* producer, TensorView* consumer, unsigned int consumer_compute_at_axis) { + FUSER_PERF_SCOPE("backwardComputeAt_impl"); + auto& producer_entry = tv_data.at(producer); // Use TensorDomain interface so it doesn't set computeAt automatically @@ -209,6 +235,8 @@ unsigned int ComputeAt::forwardComputeAt_impl( TensorView* producer, TensorView* consumer, unsigned int producer_compute_at_axis) { + FUSER_PERF_SCOPE("forwardComputeAt_impl"); + auto& consumer_entry = tv_data.at(consumer); const auto& producer_entry = tv_data.at(producer); @@ -229,6 +257,8 @@ unsigned int ComputeAt::forwardComputeAt_impl( } void ComputeAt::setCommonConsumer() { + FUSER_PERF_SCOPE("ComputeAt::setCommonConsumer"); + // Convert the first chain to a set. std::set common_consumers( producer_use_chains_.front().begin(), producer_use_chains_.front().end()); @@ -281,6 +311,8 @@ void ComputeAt::setCommonConsumer() { // Similar to backward traversal in traverseAllKnown but we should only apply // computeAt if it will increase computeAt positions. void ComputeAt::traverseBackward() { + FUSER_PERF_SCOPE("ComputeAt::traverseBackward"); + // propagate *backward* through all *producer* use_chains or from *producer* // to common_consumer if common_consumer exists. Only apply transform if // increases computeAt position. @@ -307,6 +339,8 @@ void ComputeAt::traverseBackward() { } void ComputeAt::traverseForward() { + FUSER_PERF_SCOPE("ComputeAt::traverseForward"); + // propagate forward through all *producer* use_chains or from *producer* to // common_consumer if common_consumer exists. 
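The FUSER_PERF_SCOPE(...) markers added throughout compute_at.cpp follow an RAII scope-instrumentation pattern: a guard object is created at the top of a function and reports when the scope exits. The sketch below is an assumption about the general shape of such a guard, not the fuser's actual macro (which lives in its instrumentation header and may record into a profiler instead); the class name ScopeGuard and the stderr reporting are invented for illustration.

#include <chrono>
#include <iostream>
#include <string>

// Hypothetical stand-in for what a FUSER_PERF_SCOPE-style macro could expand to.
class ScopeGuard {
 public:
  explicit ScopeGuard(std::string name)
      : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}
  ~ScopeGuard() {
    const auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                        std::chrono::steady_clock::now() - start_)
                        .count();
    std::cerr << name_ << " took " << us << " us\n";
  }

 private:
  std::string name_;
  std::chrono::steady_clock::time_point start_;
};

void traverseForwardExample() {
  ScopeGuard guard("ComputeAt::traverseForward"); // reported when the function returns
  // ... pass body ...
}

int main() {
  traverseForwardExample();
  return 0;
}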
auto chains = producer_use_chains_; @@ -338,6 +372,8 @@ void ComputeAt::traverseForward() { } void ComputeAt::runPass() { + FUSER_PERF_SCOPE("ComputeAt::runPass"); + // Initialize tv_data for all TensorViews we may modify auto chains = producer_use_chains_; if (common_consumer_ != nullptr) { @@ -382,6 +418,8 @@ void ComputeAt::runPass() { } void ComputeAt::setupOutputs() { + FUSER_PERF_SCOPE("ComputeAt::setupOutputs"); + if (common_consumer_ != nullptr) return; @@ -421,9 +459,6 @@ ComputeAt::ComputeAt( : producer_(_producer), consumer_(_consumer), consumer_position_(_consumer_position) { - if (consumer_position_ < 0) - consumer_position_ += consumer_->nDims(); - TORCH_INTERNAL_ASSERT( consumer_position_ >= 0 && consumer_position_ <= consumer_->nDims(), "Invalid computeAt axis, received ", diff --git a/torch/csrc/jit/codegen/cuda/compute_at.h b/torch/csrc/jit/codegen/cuda/compute_at.h index 84677ae99448..a9112a6225ca 100644 --- a/torch/csrc/jit/codegen/cuda/compute_at.h +++ b/torch/csrc/jit/codegen/cuda/compute_at.h @@ -56,9 +56,7 @@ class ComputeAtData { // If we set computeAt, save the domain so we can reset it after traversal. // Traversal state can deviate from the domain we will want to save after the // entire computeAt pass. - void setComputeAtDomain(TensorDomain* td) { - new_compute_at_domain_ = td; - } + void setComputeAtDomain(TensorDomain* td); // Return domain set in setComputeAtDomain TensorDomain* getComputeAtDomain() const { diff --git a/torch/csrc/jit/codegen/cuda/docs/.gitignore b/torch/csrc/jit/codegen/cuda/docs/.gitignore new file mode 100644 index 000000000000..1936cc1d441e --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/docs/.gitignore @@ -0,0 +1 @@ +html diff --git a/torch/csrc/jit/codegen/cuda/docs/documentation.h b/torch/csrc/jit/codegen/cuda/docs/documentation.h new file mode 100644 index 000000000000..cfd4435461b9 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/docs/documentation.h @@ -0,0 +1,23 @@ + +#error This is used exclusively for generating the documentation (not a real header) + +//! \namespace torch::jit::fuser +//! \brief Main PyTorch JIT Fuser namespace + +//! \namespace torch::jit::fuser::cuda +//! \brief CUDA specific components + +//! \namespace torch::jit::fuser::cuda::executor_utils +//! \brief Fuser executor related utilities + +//! \namespace torch::jit::fuser::kir +//! \brief Kernel IR + +//! \namespace torch::jit::fuser::ir_utils +//! \brief IR manipulation utilities + +//! \namespace torch::jit::fuser::loop_utils +//! \brief Loop utilities + +//! \namespace torch::jit::fuser::scope_utils +//! \brief Scope utilities diff --git a/torch/csrc/jit/codegen/cuda/docs/fuser.doxygen b/torch/csrc/jit/codegen/cuda/docs/fuser.doxygen new file mode 100644 index 000000000000..b9a51b187aa5 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/docs/fuser.doxygen @@ -0,0 +1,2515 @@ +# Doxyfile 1.8.14 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). 
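documentation.h above documents the fuser namespaces with //! comments; with the autobrief options enabled later in this Doxyfile, the first sentence of such a comment becomes the brief description and the rest becomes the detailed description. A small illustrative snippet in that style follows; the function debugKernelName is an invented example, not part of the fuser API.

#include <string>

//! Generates a human-readable name for a kernel (illustrative API only).
//!
//! The first sentence above becomes the Doxygen brief description; this
//! paragraph becomes the detailed description.
//! \param kernel_id numeric id assigned by the caller
//! \return a name of the form "kernel_<id>"
std::string debugKernelName(int kernel_id) {
  return "kernel_" + std::to_string(kernel_id);
}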
+ +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all text +# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv +# built into libc) for the transcoding. See +# https://www.gnu.org/software/libiconv/ for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. + +PROJECT_NAME = "PyTorch JIT Fuser" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. 
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. 
+ +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. + +JAVADOC_AUTOBRIEF = YES + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = YES + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines (in the resulting output). You can put ^^ in the value part of an +# alias to insert a newline as if a physical newline was in the original file. + +ALIASES = + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding "class=itcl::class" +# will allow you to use the command class in the itcl::class meaning. 
+ +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, Javascript, +# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: +# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: +# Fortran. In the later case the parser tries to guess whether the code is fixed +# or free formatted code, this is the default for Fortran type files), VHDL. For +# instance to make doxygen treat .inc files as Fortran files (default is PHP), +# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See http://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibilities issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 0. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +TOC_INCLUDE_HEADINGS = 0 + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) 
but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = YES + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = NO + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. + +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. 
+ +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = NO + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. + +# TODO: switch to NO once key concepts are documented +EXTRACT_ALL = YES + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. 
+ +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = YES + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = YES + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# (class|struct|union) declarations. If set to NO, these declarations will be +# included in the documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file +# names in lower-case letters. If set to YES, upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. +# The default value is: system dependent. + +CASE_SENSE_NAMES = NO + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. + +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. 
+# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. + +GENERATE_TODOLIST = NO + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. 
+ +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if ... \endif and \cond +# ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. +# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command command input-file, where command is the value of the +# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. + +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. 
+ +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. + +WARN_IF_UNDOCUMENTED = NO + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. +# The default value is: NO. + +WARN_NO_PARAMDOC = NO + +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. +# The default value is: NO. + +WARN_AS_ERROR = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. + +INPUT += .. +INPUT += documentation.h +INPUT += main_page.md + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. 
See the libiconv +# documentation (see: https://www.gnu.org/software/libiconv/) for the list of +# possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, +# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf. + +FILE_PATTERNS = *.c \ + *.cc \ + *.cxx \ + *.cpp \ + *.c++ \ + *.java \ + *.ii \ + *.ixx \ + *.ipp \ + *.i++ \ + *.inl \ + *.idl \ + *.ddl \ + *.odl \ + *.h \ + *.hh \ + *.hxx \ + *.hpp \ + *.h++ \ + *.cs \ + *.d \ + *.php \ + *.php4 \ + *.php5 \ + *.phtml \ + *.inc \ + *.m \ + *.markdown \ + *.md \ + *.mm \ + *.dox \ + *.py \ + *.pyw \ + *.f90 \ + *.f95 \ + *.f03 \ + *.f08 \ + *.f \ + *.for \ + *.tcl \ + *.vhd \ + *.vhdl \ + *.ucf \ + *.qsf + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = YES + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE += + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS += Ui +EXCLUDE_SYMBOLS += internal +EXCLUDE_SYMBOLS += __* + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. 
+ +EXAMPLE_PATTERNS = * + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. +# The default value is: NO. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or directories +# that contain images that are to be included in the documentation (see the +# \image command). + +IMAGE_PATH = images + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command: +# +# +# +# where is the value of the INPUT_FILTER tag, and is the +# name of an input file. Doxygen will then use the output that the filter +# program writes to standard output. If FILTER_PATTERNS is specified, this tag +# will be ignored. +# +# Note that the filter must not add or remove lines; it is applied before the +# code is scanned, but not when the output code is generated. If lines are added +# or removed, the anchors will not be placed correctly. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: pattern=filter +# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how +# filters are used. If the FILTER_PATTERNS tag is empty or if none of the +# patterns match the file name, INPUT_FILTER is applied. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will also be used to filter the input files that are used for +# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). +# The default value is: NO. + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and +# it is also possible to disable source filtering for a specific pattern using +# *.ext= (so without naming a filter). +# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. + +FILTER_SOURCE_PATTERNS = + +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page +# (index.html). This can be useful if you have a project on for instance GitHub +# and want to reuse the introduction page also for the doxygen output. + +USE_MDFILE_AS_MAINPAGE = main_page.md + +#--------------------------------------------------------------------------- +# Configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will be +# generated. Documented entities will be cross-referenced with these sources. 
+# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# function all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. + +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see https://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the +# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the +# cost of reduced performance. This can be particularly helpful with template +# rich C++ code for which doxygen's built-in parser lacks the necessary type +# information. 
+# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse-libclang=ON option for CMake. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = NO + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. + +CLANG_OPTIONS = --std=c++1z + +# If clang assisted parsing is enabled you can provide the clang parser with the +# path to the compilation database (see: +# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) used when the files +# were built. This is equivalent to specifying the "-p" option to a clang tool, +# such as clang-check. These options will then be passed to the parser. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse-libclang=ON option for CMake. +# The default value is: 0. + +CLANG_COMPILATION_DATABASE_PATH = 0 + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in +# which the alphabetical index list will be split. +# Minimum value: 1, maximum value: 20, default value: 5. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. 
+# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# https://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. 
+# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. +# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = NO + +# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML +# documentation will contain a main index with vertical navigation menus that +# are dynamically created via Javascript. If disabled, the navigation index will +# consists of multiple levels of tabs that are statically embedded in every HTML +# page. Disable this option to support browsers that do not have Javascript, +# like the Qt help browser. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_MENUS = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries 1 will produce a full collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a full expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: https://developer.apple.com/tools/xcode/), introduced with +# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a +# Makefile in the HTML output directory. 
Running make will produce the docset in +# that directory and running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See https://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the master .chm file (NO). +# The default value is: NO. 
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: http://doc.qt.io/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: http://doc.qt.io/qt-4.8/qthelpproject.html#virtual-folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# http://doc.qt.io/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. 
+ +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 1 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +TREEVIEW_WIDTH = 250 + +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. +# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes have effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_TRANSPARENT = YES + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# https://www.mathjax.org) which uses client side Javascript for the rendering +# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX +# installed or if you want to formulas look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +USE_MATHJAX = NO + +# When MathJax is enabled you can set the default output format to be used for +# the MathJax output. See the MathJax site (see: +# http://docs.mathjax.org/en/latest/output.html) for more details. +# Possible values are: HTML-CSS (which is slower, but has the best +# compatibility), NativeMML (i.e. MathML) and SVG. +# The default value is: HTML-CSS. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_FORMAT = HTML-CSS + +# When MathJax is enabled you need to specify the location relative to the HTML +# output directory using the MATHJAX_RELPATH option. The destination directory +# should contain the MathJax.js script. For instance, if the mathjax directory +# is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax +# Content Delivery Network so you can quickly see the result without installing +# MathJax. However, it is strongly recommended to install a local copy of +# MathJax from https://www.mathjax.org before deployment. +# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_RELPATH = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/ + +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax +# extension names that should be enabled during MathJax rendering. For example +# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_EXTENSIONS = + +# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces +# of code that will be used on startup of the MathJax code. 
See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/